From e652c0f8f3e7c7a1b42edf22cfc5bbfd597fd164 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 4 Jul 2020 10:26:56 -0700 Subject: [PATCH] [X86] Teach lowerShuffleAsBlend to use bit blend for v16i8/v32i8/v16i16 when avx512vl is enabled but not avx512bw. Probably not super important since there are no real CPUs with avx512vl and not avx512bw. But vpternlog should be better than vblendvb. I do wonder if we should use vpternlog even with BWI. We currently use vblendmb or vpblendmw by putting the mask into a GPR and moving it to a k-register. But I don't think we hoist the GPR to k-register copy in machine LICM. Using VPTERNLOG would use a constant pool load, but has the advantage that we're pretty good at hoisting and rematerializing those. Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D83156 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 6 ++++++ llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll | 13 ++++++------- llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll | 9 +++++---- llvm/test/CodeGen/X86/vector-fshl-128.ll | 4 ++-- llvm/test/CodeGen/X86/vector-fshl-256.ll | 4 ++-- llvm/test/CodeGen/X86/vector-fshr-128.ll | 6 +++--- llvm/test/CodeGen/X86/vector-fshr-256.ll | 6 +++--- llvm/test/CodeGen/X86/vector-shuffle-v48.ll | 19 +++++++++---------- 8 files changed, 36 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 83ad55a..88a5637 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -11731,6 +11731,12 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); } + // If we have VPTERNLOG, we can use that as a bit blend. + if (Subtarget.hasVLX()) + if (SDValue BitBlend = + lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) + return BitBlend; + // Scale the blend by the number of bytes per element. int Scale = VT.getScalarSizeInBits() / 8; diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll index 4647231..0706254 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll @@ -148,18 +148,17 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX256VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX256VL-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX256VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] -; AVX256VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,12,13,u,u,8,9,6,7,14,15,14,15,0,1,22,23,28,29,18,19,26,27,22,23,u,u,30,31,16,17] +; AVX256VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,12,13],zero,zero,ymm1[8,9,6,7,14,15,14,15,0,1,22,23,28,29,18,19,26,27,22,23],zero,zero,ymm1[30,31,16,17] ; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm2 {%k1} {z} ; AVX256VL-NEXT: vpmovdw %ymm2, %xmm2 ; AVX256VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 ; AVX256VL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX256VL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,2,1] -; AVX256VL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255] -; AVX256VL-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX256VL-NEXT: vpmovsxwd %xmm1, %ymm2 -; AVX256VL-NEXT: vpslld $31, %ymm2, %ymm2 -; AVX256VL-NEXT: vptestmd %ymm2, %ymm2, %k1 -; AVX256VL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX256VL-NEXT: vpternlogq $220, {{.*}}(%rip), %ymm1, %ymm2 +; AVX256VL-NEXT: vpmovsxwd %xmm2, %ymm1 +; AVX256VL-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k1 +; AVX256VL-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX256VL-NEXT: vpmovsxwd %xmm1, %ymm1 ; AVX256VL-NEXT: vpslld $31, %ymm1, %ymm1 ; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k0 diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll index d8f1e54..6b49f22 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -1345,10 +1345,11 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind { ; ; AVX512VL-LABEL: negative: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] +; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512VL-NEXT: vpternlogq $206, %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,3,2,3] ; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll index 7cd454f..b2ad1b3 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -2905,8 +2905,8 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm2, %zmm2 ; AVX512VL-NEXT: vpord %zmm1, %zmm2, %zmm1 ; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551360] +; AVX512VL-NEXT: vpternlogq $216, %xmm2, %xmm1, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll index 678ae18..674b064 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -2376,8 +2376,8 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { ; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512VL-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360] +; AVX512VL-NEXT: vpternlogq $216, %ymm2, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: constant_funnnel_v32i8: diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll index 35ba0e8..23fbc5e 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -2651,9 +2651,9 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 ; AVX512VL-NEXT: vpord %zmm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512VL-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm2 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [18446744073709551360,18446744073709551360] +; AVX512VL-NEXT: vpternlogq $202, %xmm1, %xmm2, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll index ad29afe..bd5698b 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -2083,9 +2083,9 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { ; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2 ; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512VL-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm2 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm0 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360] +; AVX512VL-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: constant_funnnel_v32i8: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll index 5cd35af..e854923 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v48.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v48.ll @@ -52,17 +52,16 @@ define <32 x i8> @foo(<48 x i8>* %x0) { ; ; AVX512F-LABEL: foo: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm0 -; AVX512F-NEXT: vmovdqu (%rdi), %ymm1 +; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14] +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-NEXT: vmovdqu 16(%rdi), %xmm2 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,2,3,5,6] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[0,2,3,5,6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,3,4,6,7,9,10,12,13,15],zero,zero,zero,zero,zero,ymm0[24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: foo: -- 2.7.4