From 59fabf9c606ecc69342463e514faefdfa2361be6 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Thu, 8 Aug 2019 13:23:53 +0000
Subject: [PATCH] [X86][SSE] matchBinaryPermuteShuffle - split INSERTPS combines

We need to prefer INSERTPS with zeros over SHUFPS, but fall back to INSERTPS
if that fails.

llvm-svn: 368292
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 25 +++++++++++------
 llvm/test/CodeGen/X86/oddshuffles.ll    | 48 ++++++++++++++++-----------------
 llvm/test/CodeGen/X86/pr29112.ll        | 30 ++++++++++-----------
 3 files changed, 54 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1229c99..6c2ebe1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31901,15 +31901,15 @@ static bool matchBinaryPermuteShuffle(
     }
   }
 
-  // Attempt to combine to INSERTPS.
+  // Attempt to combine to INSERTPS, but only if it has elements that need to
+  // be set to zero.
   if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
-      MaskVT.is128BitVector()) {
-    if (Zeroable.getBoolValue() &&
-        matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
-      Shuffle = X86ISD::INSERTPS;
-      ShuffleVT = MVT::v4f32;
-      return true;
-    }
+      MaskVT.is128BitVector() &&
+      llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }) &&
+      matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
+    Shuffle = X86ISD::INSERTPS;
+    ShuffleVT = MVT::v4f32;
+    return true;
   }
 
   // Attempt to combine to SHUFPD.
@@ -31971,6 +31971,15 @@ static bool matchBinaryPermuteShuffle(
     }
   }
 
+  // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
+  if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
+      MaskVT.is128BitVector() &&
+      matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
+    Shuffle = X86ISD::INSERTPS;
+    ShuffleVT = MVT::v4f32;
+    return true;
+  }
+
   return false;
 }
 
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 9284408..b8a5b87 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -1240,40 +1240,38 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2
 ;
 ; SSE42-LABEL: interleave_24i32_out:
 ; SSE42:       # %bb.0:
-; SSE42-NEXT:    movdqu 80(%rdi), %xmm9
-; SSE42-NEXT:    movdqu 64(%rdi), %xmm10
+; SSE42-NEXT:    movups 80(%rdi), %xmm8
+; SSE42-NEXT:    movdqu 64(%rdi), %xmm9
 ; SSE42-NEXT:    movdqu (%rdi), %xmm4
 ; SSE42-NEXT:    movdqu 16(%rdi), %xmm2
-; SSE42-NEXT:    movdqu 32(%rdi), %xmm11
+; SSE42-NEXT:    movups 32(%rdi), %xmm10
 ; SSE42-NEXT:    movdqu 48(%rdi), %xmm5
-; SSE42-NEXT:    pshufd {{.*#+}} xmm8 = xmm11[0,1,0,1]
-; SSE42-NEXT:    movdqa %xmm2, %xmm7
-; SSE42-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7]
-; SSE42-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[2,3,0,1]
+; SSE42-NEXT:    movdqa %xmm2, %xmm6
+; SSE42-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[2,3,0,1]
 ; SSE42-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[2,3]
-; SSE42-NEXT:    blendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3]
-; SSE42-NEXT:    movdqa %xmm10, %xmm1
+; SSE42-NEXT:    insertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[1]
+; SSE42-NEXT:    movdqa %xmm9, %xmm1
 ; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5,6,7]
 ; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,0,1]
-; SSE42-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm10[2,3]
-; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm9[0,1,0,1]
-; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,5],xmm3[6,7]
-; SSE42-NEXT:    pshufd {{.*#+}} xmm5 = xmm11[0,1,2,2]
-; SSE42-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,0,3,3]
-; SSE42-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm5[6,7]
+; SSE42-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,3],xmm9[2,3]
+; SSE42-NEXT:    insertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[1]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm10[0,1,2,2]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,0,3,3]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm3[6,7]
 ; SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,0,3,3]
-; SSE42-NEXT:    pshufd {{.*#+}} xmm5 = xmm9[0,1,2,2]
-; SSE42-NEXT:    pblendw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,5],xmm5[6,7]
-; SSE42-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm2[2,3],xmm6[4,5,6,7]
-; SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm11[0,1,0,3]
-; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
-; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3],xmm0[4,5,6,7]
-; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm9[0,1,0,3]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm8[0,1,2,2]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5],xmm3[6,7]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm2[2,3],xmm7[4,5,6,7]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm10[0,1,0,3]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4,5,6,7]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3],xmm0[4,5,6,7]
+; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm8[0,1,0,3]
 ; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; SSE42-NEXT:    movdqu %xmm3, 16(%rsi)
+; SSE42-NEXT:    movups %xmm5, 16(%rsi)
 ; SSE42-NEXT:    movups %xmm4, (%rsi)
-; SSE42-NEXT:    movdqu %xmm5, 16(%rdx)
-; SSE42-NEXT:    movdqu %xmm7, (%rdx)
+; SSE42-NEXT:    movdqu %xmm3, 16(%rdx)
+; SSE42-NEXT:    movdqu %xmm6, (%rdx)
 ; SSE42-NEXT:    movdqu %xmm2, 16(%rcx)
 ; SSE42-NEXT:    movdqu %xmm1, (%rcx)
 ; SSE42-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/pr29112.ll b/llvm/test/CodeGen/X86/pr29112.ll
index 6ce9da9..a9e99e4 100644
--- a/llvm/test/CodeGen/X86/pr29112.ll
+++ b/llvm/test/CodeGen/X86/pr29112.ll
@@ -11,9 +11,9 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <
 ; CHECK-NEXT:    subq $72, %rsp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 80
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm9
-; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm15 = [4,22,1,17,4,22,1,17,4,22,1,17,4,22,1,17]
-; CHECK-NEXT:    # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; CHECK-NEXT:    vpermi2ps %zmm3, %zmm2, %zmm15
+; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm14 = [4,22,1,17,4,22,1,17,4,22,1,17,4,22,1,17]
+; CHECK-NEXT:    # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT:    vpermi2ps %zmm3, %zmm2, %zmm14
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm10 = [4,30,1,22,4,30,1,22,4,30,1,22,4,30,1,22]
 ; CHECK-NEXT:    # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    vpermi2ps %zmm3, %zmm2, %zmm10
@@ -27,7 +27,7 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <
 ; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm6
 ; CHECK-NEXT:    vunpcklps {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm11[0,1],xmm2[1],xmm11[3]
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm14 = xmm1[0,1,2],xmm3[1]
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm13 = xmm1[0,1,2],xmm3[1]
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm6 = xmm4[0,1,2],xmm3[1]
 ; CHECK-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT:    vextractf32x4 $2, %zmm3, %xmm4
@@ -39,20 +39,19 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm7[0,1],xmm2[1],xmm7[3]
 ; CHECK-NEXT:    vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm3[3]
 ; CHECK-NEXT:    vblendps {{.*#+}} xmm12 = xmm1[0,1,2],xmm3[3]
-; CHECK-NEXT:    vpermilpd {{.*#+}} xmm13 = xmm3[1,0]
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm8[0,1,2],xmm3[1]
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[1]
-; CHECK-NEXT:    vaddps %xmm1, %xmm3, %xmm8
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[1]
+; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm8
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm11[0,1],xmm2[3],xmm11[3]
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm13[0]
-; CHECK-NEXT:    vaddps %xmm15, %xmm2, %xmm2
-; CHECK-NEXT:    vmovaps %xmm14, %xmm1
-; CHECK-NEXT:    vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vaddps %xmm10, %xmm14, %xmm10
-; CHECK-NEXT:    vaddps %xmm14, %xmm14, %xmm3
-; CHECK-NEXT:    vaddps %xmm12, %xmm15, %xmm0
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
+; CHECK-NEXT:    vaddps %xmm14, %xmm2, %xmm2
+; CHECK-NEXT:    vmovaps %xmm13, %xmm1
+; CHECK-NEXT:    vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vaddps %xmm10, %xmm13, %xmm10
+; CHECK-NEXT:    vaddps %xmm13, %xmm13, %xmm3
+; CHECK-NEXT:    vaddps %xmm12, %xmm14, %xmm0
 ; CHECK-NEXT:    vaddps %xmm8, %xmm0, %xmm0
-; CHECK-NEXT:    vaddps %xmm0, %xmm14, %xmm0
+; CHECK-NEXT:    vaddps %xmm0, %xmm13, %xmm0
 ; CHECK-NEXT:    vmovaps %xmm3, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    vmovaps %xmm10, (%rsp)
 ; CHECK-NEXT:    vmovaps %xmm9, %xmm3
@@ -65,7 +64,6 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
   %a1 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
-  %a2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
   %a5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
   %a6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32>
--
2.7.4
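
For reference, below is a standalone, simplified C++ sketch of the matcher ordering this patch establishes for 128-bit float shuffles: INSERTPS is tried early only when the mask actually needs zeroed lanes, SHUFPS-style patterns are tried next, and INSERTPS remains as a general fallback. The mask model, the matchAsInsertPS/matchAsShufPS helpers and the Node enum are illustrative stand-ins, not the real LLVM matchers or API.

// Standalone sketch (not the LLVM API): models the post-patch ordering of
// matchBinaryPermuteShuffle for v4f32 shuffles. SM_SentinelZero and the
// helpers below are simplified stand-ins for the real code.
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <vector>

namespace {

constexpr int SM_SentinelZero = -2; // lane must be forced to zero

enum class Node { InsertPS, ShufPS, None };

// Stand-in for matchShuffleAsInsertPS: INSERTPS can copy at most one lane
// from the second source into the first and zero any subset of lanes.
bool matchAsInsertPS(const std::vector<int> &Mask) {
  int FromV2 = 0;
  for (size_t I = 0; I < Mask.size(); ++I) {
    int M = Mask[I];
    if (M == SM_SentinelZero)
      continue;
    if (M >= 4)                              // lane taken from the second operand
      ++FromV2;
    else if (M != static_cast<int>(I))       // first-operand lanes must stay in place
      return false;
  }
  return FromV2 <= 1;
}

// Stand-in for the SHUFPS matcher: low two lanes from one source, high two
// from the other, and no zeroing.
bool matchAsShufPS(const std::vector<int> &Mask) {
  return std::none_of(Mask.begin(), Mask.end(),
                      [](int M) { return M == SM_SentinelZero; }) &&
         Mask[0] < 4 && Mask[1] < 4 && Mask[2] >= 4 && Mask[3] >= 4;
}

// Ordering introduced by the patch: INSERTPS first, but only when the mask
// actually needs zeroed lanes; otherwise prefer SHUFPS and keep INSERTPS as
// the final fallback.
Node matchBinaryPermute(const std::vector<int> &Mask) {
  assert(Mask.size() == 4 && "v4f32 masks only in this sketch");
  bool NeedsZero = std::any_of(Mask.begin(), Mask.end(),
                               [](int M) { return M == SM_SentinelZero; });
  if (NeedsZero && matchAsInsertPS(Mask))
    return Node::InsertPS;
  if (matchAsShufPS(Mask))
    return Node::ShufPS;
  if (matchAsInsertPS(Mask))
    return Node::InsertPS;
  return Node::None;
}

} // namespace

int main() {
  // Zeroing mask: takes the early INSERTPS path.
  assert(matchBinaryPermute({0, 1, SM_SentinelZero, 7}) == Node::InsertPS);
  // Plain two-source blend of low/high halves: SHUFPS wins.
  assert(matchBinaryPermute({0, 1, 6, 7}) == Node::ShufPS);
  // No zeroing and not a SHUFPS pattern: falls back to INSERTPS.
  assert(matchBinaryPermute({0, 1, 2, 5}) == Node::InsertPS);
  std::puts("ok");
}

The third case in main mirrors the motivation for the split: a mask with no zeroable lanes that SHUFPS cannot express still reaches INSERTPS through the late fallback, while masks that do need zeroing keep getting INSERTPS ahead of SHUFPS.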