From: Sanjay Patel
Date: Sun, 9 Sep 2018 14:13:22 +0000 (+0000)
Subject: [SelectionDAG] enhance vector demanded elements to look at a vector select condition operand
X-Git-Tag: llvmorg-8.0.0-rc1~9145
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=6ebf218e4c6e168f7e34101018121a67c2625123;p=platform%2Fupstream%2Fllvm.git

[SelectionDAG] enhance vector demanded elements to look at a vector select condition operand

This is the DAG equivalent of D51433: if we know that not all vector lanes are
used, apply that knowledge to potentially simplify a vselect condition.

The reduction/horizontal tests show that we now eliminate AVX1 operations on the
upper half of 256-bit vectors because those lanes are never demanded.

I'm not sure what the pr34592 test is showing. That test runs at -O0; is
SimplifyDemandedVectorElts supposed to be running there?
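As a minimal sketch of the pattern this targets (illustrative only; this IR is
not one of the committed tests and the function name is made up), consider one
step of a signed-max reduction where only the low lane of the select result is
extracted:

  define i64 @smax_low_lane(<4 x i64> %x, <4 x i64> %y) {
    %c = icmp sgt <4 x i64> %x, %y                      ; condition vector
    %m = select <4 x i1> %c, <4 x i64> %x, <4 x i64> %y ; becomes ISD::VSELECT
    %r = extractelement <4 x i64> %m, i32 0             ; only lane 0 is demanded
    ret i64 %r
  }

On an AVX1 target a <4 x i64> compare is split into two 128-bit halves, so
simplifying the vselect condition with only lane 0 demanded is what lets the
vextractf128/vpcmpgtq/vinsertf128 for the upper half drop out of the checked
assembly below.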
Differential Revision: https://reviews.llvm.org/D51696

llvm-svn: 341762
---

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index a0402eb..eb44560 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1532,12 +1532,20 @@ bool TargetLowering::SimplifyDemandedVectorElts(
     break;
   }
   case ISD::VSELECT: {
-    APInt DemandedLHS(DemandedElts);
-    APInt DemandedRHS(DemandedElts);
-
-    // TODO - add support for constant vselect masks.
+    // Try to transform the select condition based on the current demanded
+    // elements.
+    // TODO: If a condition element is undef, we can choose from one arm of the
+    // select (and if one arm is undef, then we can propagate that to the
+    // result).
+    // TODO - add support for constant vselect masks (see IR version of this).
+    APInt UnusedUndef, UnusedZero;
+    if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, UnusedUndef,
+                                   UnusedZero, TLO, Depth + 1))
+      return true;

     // See if we can simplify either vselect operand.
+    APInt DemandedLHS(DemandedElts);
+    APInt DemandedRHS(DemandedElts);
     APInt UndefLHS, ZeroLHS;
     APInt UndefRHS, ZeroRHS;
     if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedLHS, UndefLHS,
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
index 4619e8e..198d2ba 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
@@ -469,9 +469,6 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
-; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X86-AVX1-NEXT: vmovd %xmm0, %eax
 ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
@@ -548,9 +545,6 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X64-AVX1-NEXT: vmovq %xmm0, %rax
 ; X64-AVX1-NEXT: vzeroupper
@@ -1159,9 +1153,6 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
-; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X86-AVX1-NEXT: vmovd %xmm0, %eax
 ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
@@ -1283,9 +1274,6 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X64-AVX1-NEXT: vmovq %xmm0, %rax
 ; X64-AVX1-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
index 9728379..fc4db53 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
@@ -472,9 +472,6 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
-; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X86-AVX1-NEXT: vmovd %xmm0, %eax
 ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
@@ -552,9 +549,6 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X64-AVX1-NEXT: vmovq %xmm0, %rax
 ; X64-AVX1-NEXT: vzeroupper
@@ -1163,9 +1157,6 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
-; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X86-AVX1-NEXT: vmovd %xmm0, %eax
 ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
@@ -1287,9 +1278,6 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X64-AVX1-NEXT: vmovq %xmm0, %rax
 ; X64-AVX1-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
index fe8612b..9f26285 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
@@ -535,12 +535,8 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
 ; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
-; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4
-; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
-; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; X86-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2
-; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
-; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
 ; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X86-AVX1-NEXT: vmovd %xmm0, %eax
 ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
@@ -631,12 +627,8 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X64-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
 ; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
-; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4
-; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
-; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; X64-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2
-; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
 ; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X64-AVX1-NEXT: vmovq %xmm0, %rax
 ; X64-AVX1-NEXT: vzeroupper
@@ -1270,12 +1262,8 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
-; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4
-; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm2
-; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3
-; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
-; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
 ; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X86-AVX1-NEXT: vmovd %xmm0, %eax
 ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
@@ -1422,12 +1410,8 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
-; X64-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4
-; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm2
-; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3
-; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
 ; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X64-AVX1-NEXT: vmovq %xmm0, %rax
 ; X64-AVX1-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
index 66dd8fd..516e25e 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
@@ -473,12 +473,8 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
 ; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
-; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4
-; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
-; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; X86-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2
-; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm2
-; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
 ; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X86-AVX1-NEXT: vmovd %xmm0, %eax
 ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
@@ -571,12 +567,8 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X64-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
 ; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
-; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4
-; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
-; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; X64-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2
-; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm2
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
 ; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X64-AVX1-NEXT: vmovq %xmm0, %rax
 ; X64-AVX1-NEXT: vzeroupper
@@ -1172,12 +1164,8 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
-; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4
-; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
-; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3
-; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
-; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
 ; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X86-AVX1-NEXT: vmovd %xmm0, %eax
 ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
@@ -1326,12 +1314,8 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
-; X64-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4
-; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
-; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3
-; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
 ; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X64-AVX1-NEXT: vmovq %xmm0, %rax
 ; X64-AVX1-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/pr34592.ll b/llvm/test/CodeGen/X86/pr34592.ll
index 86ce7b9..b04f338 100644
--- a/llvm/test/CodeGen/X86/pr34592.ll
+++ b/llvm/test/CodeGen/X86/pr34592.ll
@@ -19,31 +19,30 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1
 ; CHECK-NEXT: vmovaps 80(%rbp), %ymm13
 ; CHECK-NEXT: vmovaps 48(%rbp), %ymm14
 ; CHECK-NEXT: vmovaps 16(%rbp), %ymm15
-; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
 ; CHECK-NEXT: vxorps %xmm6, %xmm6, %xmm6
-; CHECK-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm8[2,3,4,5,6,7]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm11[2,3,4,5,6,7]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7]
 ; CHECK-NEXT: # kill: def $xmm9 killed $xmm9 killed $ymm9
-; CHECK-NEXT: vmovdqa %xmm9, %xmm11
-; CHECK-NEXT: # kill: def $ymm11 killed $xmm11
-; CHECK-NEXT: vpalignr {{.*#+}} ymm6 = ymm2[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
-; CHECK-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,0]
+; CHECK-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vmovdqa %xmm9, %xmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $xmm0
+; CHECK-NEXT: vpalignr {{.*#+}} ymm11 = ymm2[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; CHECK-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,0]
 ; CHECK-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT: # implicit-def: $ymm0
 ; CHECK-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7]
 ; CHECK-NEXT: vmovaps %xmm2, %xmm9
 ; CHECK-NEXT: # implicit-def: $ymm2
 ; CHECK-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm2
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[2],ymm6[2]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7]
 ; CHECK-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
 ; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
 ; CHECK-NEXT: vmovaps %xmm7, %xmm9
 ; CHECK-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4,5,6,7]
 ; CHECK-NEXT: # implicit-def: $ymm6
 ; CHECK-NEXT: vmovaps %xmm9, %xmm6
+; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
 ; CHECK-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
 ; CHECK-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,3]
 ; CHECK-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7]
@@ -56,9 +55,9 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1
 ; CHECK-NEXT: vmovaps %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT: vmovaps %ymm6, %ymm3
 ; CHECK-NEXT: vmovaps %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT: vmovaps %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT: vmovaps %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT: vmovaps %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT: vmovaps %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT: vmovaps %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT: vmovaps %ymm14, (%rsp) # 32-byte Spill
 ; CHECK-NEXT: movq %rbp, %rsp
diff --git a/llvm/test/CodeGen/X86/vector-reduce-smax.ll b/llvm/test/CodeGen/X86/vector-reduce-smax.ll
index a1b72fc..a83bbce 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-smax.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-smax.ll
@@ -158,9 +158,6 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vzeroupper
@@ -343,9 +340,6 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vzeroupper
@@ -645,9 +639,6 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-reduce-smin.ll b/llvm/test/CodeGen/X86/vector-reduce-smin.ll
index ec4ab82..91d0c15 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-smin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-smin.ll
@@ -157,9 +157,6 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vzeroupper
@@ -342,9 +339,6 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vzeroupper
@@ -644,9 +638,6 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll
index 69e1dcd..8b57591 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll
@@ -164,12 +164,8 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vzeroupper
@@ -364,12 +360,8 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
-; AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vzeroupper
@@ -693,10 +685,6 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2
 ; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3
 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll
index fa8c1cb..c9e1ef5 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll
@@ -163,12 +163,8 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vzeroupper
@@ -363,12 +359,8 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
-; AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vzeroupper
@@ -692,10 +684,6 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2
 ; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3
 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vzeroupper