From 6195ed83978c7f97d2c415d1903d7a33ccd8b47f Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Sun, 29 Sep 2019 01:24:16 +0000
Subject: [PATCH] [X86] Match (or (and A, B), (andn A, C)) to VPTERNLOG with
 AVX512.

This uses an isel pattern similar to the one we used for vpcmov with XOP.

llvm-svn: 373154
---
 llvm/lib/Target/X86/X86InstrAVX512.td        | 43 +++++++++++++++++++
 llvm/test/CodeGen/X86/vector-fshl-512.ll     | 10 ++---
 llvm/test/CodeGen/X86/vector-fshl-rot-512.ll | 64 +++++++++++-----------------
 llvm/test/CodeGen/X86/vector-fshr-512.ll     | 10 ++---
 llvm/test/CodeGen/X86/vector-fshr-rot-512.ll | 64 +++++++++++-----------------
 llvm/test/CodeGen/X86/vector-rotate-512.ll   | 58 +++++++++----------------
 6 files changed, 122 insertions(+), 127 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 4d0a9cd..60bcf3e 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -11533,6 +11533,49 @@ let Predicates = [HasVLX] in {
             (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
 }
 
+let Predicates = [HasVLX] in {
+  def : Pat<(v16i8 (or (and VR128X:$src1, VR128X:$src2),
+                       (X86andnp VR128X:$src1, VR128X:$src3))),
+            (VPTERNLOGQZ128rri VR128X:$src1, VR128X:$src2, VR128X:$src3, (i8 202))>;
+  def : Pat<(v8i16 (or (and VR128X:$src1, VR128X:$src2),
+                       (X86andnp VR128X:$src1, VR128X:$src3))),
+            (VPTERNLOGQZ128rri VR128X:$src1, VR128X:$src2, VR128X:$src3, (i8 202))>;
+  def : Pat<(v4i32 (or (and VR128X:$src1, VR128X:$src2),
+                       (X86andnp VR128X:$src1, VR128X:$src3))),
+            (VPTERNLOGQZ128rri VR128X:$src1, VR128X:$src2, VR128X:$src3, (i8 202))>;
+  def : Pat<(v2i64 (or (and VR128X:$src1, VR128X:$src2),
+                       (X86andnp VR128X:$src1, VR128X:$src3))),
+            (VPTERNLOGQZ128rri VR128X:$src1, VR128X:$src2, VR128X:$src3, (i8 202))>;
+
+  def : Pat<(v32i8 (or (and VR256X:$src1, VR256X:$src2),
+                       (X86andnp VR256X:$src1, VR256X:$src3))),
+            (VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3, (i8 202))>;
+  def : Pat<(v16i16 (or (and VR256X:$src1, VR256X:$src2),
+                        (X86andnp VR256X:$src1, VR256X:$src3))),
+            (VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3, (i8 202))>;
+  def : Pat<(v8i32 (or (and VR256X:$src1, VR256X:$src2),
+                       (X86andnp VR256X:$src1, VR256X:$src3))),
+            (VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3, (i8 202))>;
+  def : Pat<(v4i64 (or (and VR256X:$src1, VR256X:$src2),
+                       (X86andnp VR256X:$src1, VR256X:$src3))),
+            (VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3, (i8 202))>;
+}
+
+let Predicates = [HasAVX512] in {
+  def : Pat<(v64i8 (or (and VR512:$src1, VR512:$src2),
+                       (X86andnp VR512:$src1, VR512:$src3))),
+            (VPTERNLOGQZrri VR512:$src1, VR512:$src2, VR512:$src3, (i8 202))>;
+  def : Pat<(v32i16 (or (and VR512:$src1, VR512:$src2),
+                        (X86andnp VR512:$src1, VR512:$src3))),
+            (VPTERNLOGQZrri VR512:$src1, VR512:$src2, VR512:$src3, (i8 202))>;
+  def : Pat<(v16i32 (or (and VR512:$src1, VR512:$src2),
+                        (X86andnp VR512:$src1, VR512:$src3))),
+            (VPTERNLOGQZrri VR512:$src1, VR512:$src2, VR512:$src3, (i8 202))>;
+  def : Pat<(v8i64 (or (and VR512:$src1, VR512:$src2),
+                       (X86andnp VR512:$src1, VR512:$src3))),
+            (VPTERNLOGQZrri VR512:$src1, VR512:$src2, VR512:$src3, (i8 202))>;
+}
+
 //===----------------------------------------------------------------------===//
 // AVX-512 - FixupImm
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll
index 9f2cad9..ebaf22a 100644
---
a/llvm/test/CodeGen/X86/vector-fshl-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll @@ -1548,16 +1548,12 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwi ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512VL-NEXT: vpandn %ymm3, %ymm4, %ymm3 ; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm2 -; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2 -; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm4, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512VL-NEXT: vpandn %ymm1, %ymm4, %ymm1 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpternlogq $226, %ymm1, %ymm4, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll index f630231..540ad19 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll @@ -172,47 +172,39 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm4 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512VL-NEXT: vpandn %ymm4, %ymm5, %ymm4 -; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm6 -; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm6 -; AVX512VL-NEXT: vpor %ymm4, %ymm6, %ymm4 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VL-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm5 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm6, %ymm5 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm4 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; AVX512VL-NEXT: vpandn %ymm4, %ymm7, %ymm4 -; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm8 -; AVX512VL-NEXT: vpand %ymm7, %ymm8, %ymm8 -; AVX512VL-NEXT: vpor %ymm4, %ymm8, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm5 +; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm7 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512VL-NEXT: vpternlogq $226, %ymm5, %ymm8, %ymm7 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: 
vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm4 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-NEXT: vpand %ymm8, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm7, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm5 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm9 -; AVX512VL-NEXT: vpor %ymm4, %ymm9, %ymm4 +; AVX512VL-NEXT: vpor %ymm5, %ymm9, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VL-NEXT: vpandn %ymm3, %ymm5, %ymm3 -; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 -; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512VL-NEXT: vpand %ymm6, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5 +; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm6, %ymm5 +; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 -; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm5, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3 -; AVX512VL-NEXT: vpandn %ymm3, %ymm7, %ymm3 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4 -; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4 -; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm8, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand %ymm8, %ymm3, %ymm3 +; AVX512VL-NEXT: vpand %ymm7, %ymm3, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4 ; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 @@ -826,16 +818,12 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1 -; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2 -; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll index 893260e..7c20ec8f 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -1532,16 +1532,12 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwi ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; 
AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512VL-NEXT: vpandn %ymm3, %ymm4, %ymm3 ; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm2 -; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2 -; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm4, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512VL-NEXT: vpandn %ymm1, %ymm4, %ymm1 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpternlogq $226, %ymm1, %ymm4, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll index 84d14b6..4e39a5a 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll @@ -181,50 +181,42 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm4 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512VL-NEXT: vpandn %ymm4, %ymm5, %ymm4 -; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm6 -; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm6 -; AVX512VL-NEXT: vpor %ymm4, %ymm6, %ymm4 -; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512VL-NEXT: vpsubb %ymm2, %ymm6, %ymm2 +; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm5 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm6, %ymm5 +; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VL-NEXT: vpsubb %ymm2, %ymm4, %ymm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VL-NEXT: vpand %ymm7, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm4 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; AVX512VL-NEXT: vpandn %ymm4, %ymm8, %ymm4 -; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm9 -; AVX512VL-NEXT: vpand %ymm8, %ymm9, %ymm9 -; AVX512VL-NEXT: vpor %ymm4, %ymm9, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm5 +; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm8 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512VL-NEXT: vpternlogq $226, %ymm5, %ymm9, %ymm8 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm4 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-NEXT: vpand %ymm9, %ymm4, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm2, 
%ymm8, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm5 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-NEXT: vpand %ymm8, %ymm5, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm10 -; AVX512VL-NEXT: vpor %ymm4, %ymm10, %ymm4 +; AVX512VL-NEXT: vpor %ymm5, %ymm10, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VL-NEXT: vpandn %ymm3, %ymm5, %ymm3 -; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 -; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512VL-NEXT: vpsubb %ymm1, %ymm6, %ymm1 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5 +; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm6, %ymm5 +; AVX512VL-NEXT: vpsubb %ymm1, %ymm4, %ymm1 ; AVX512VL-NEXT: vpand %ymm7, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 -; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm5, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3 -; AVX512VL-NEXT: vpandn %ymm3, %ymm8, %ymm3 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4 -; AVX512VL-NEXT: vpand %ymm8, %ymm4, %ymm4 -; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm9, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand %ymm9, %ymm3, %ymm3 +; AVX512VL-NEXT: vpand %ymm8, %ymm3, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4 ; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 @@ -846,16 +838,12 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1 -; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2 -; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll index 2c3dbd8..2e03525 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-512.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll @@ -167,44 +167,36 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm4 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512VL-NEXT: 
vpandn %ymm4, %ymm5, %ymm4 -; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm6 -; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm6 -; AVX512VL-NEXT: vpor %ymm4, %ymm6, %ymm4 +; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm5 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm6, %ymm5 ; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm4 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; AVX512VL-NEXT: vpandn %ymm4, %ymm6, %ymm4 -; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm7 -; AVX512VL-NEXT: vpand %ymm6, %ymm7, %ymm7 -; AVX512VL-NEXT: vpor %ymm4, %ymm7, %ymm4 +; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm5 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm7, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm4 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm8 ; AVX512VL-NEXT: vpor %ymm4, %ymm8, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VL-NEXT: vpandn %ymm3, %ymm5, %ymm3 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 -; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm6, %ymm4 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 -; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3 -; AVX512VL-NEXT: vpandn %ymm3, %ymm6, %ymm3 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4 -; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4 -; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm7, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4 ; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 @@ -815,16 +807,12 @@ define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1 -; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm2, 
%ymm1, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2 -; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; @@ -964,18 +952,14 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1 -; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm1 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39] ; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX512VL-NEXT: vpandn %ymm4, %ymm3, %ymm4 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm3, %ymm0 ; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq -- 2.7.4
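Note (not part of the patch itself): the 202 immediate used by the new patterns is the 8-bit truth table for dst = (src1 & src2) | (~src1 & src3), i.e. a bitwise select with src1 as the mask. VPTERNLOG indexes its immediate with one bit from each source per bit position (src1 contributing bit 2, src2 bit 1, src3 bit 0), which gives 0b11001010 = 0xCA = 202; because the operation is purely bitwise, the qword form (VPTERNLOGQZ*rri) works for every element type. Below is a minimal standalone C++ sketch of that derivation, assuming the bit ordering described above; the helper name ternlogImm is made up for illustration.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Build a VPTERNLOG immediate from a 3-input boolean function, assuming the
// truth-table index for each bit position is (src1 << 2) | (src2 << 1) | src3.
static uint8_t ternlogImm(bool (*F)(bool, bool, bool)) {
  uint8_t Imm = 0;
  for (unsigned Idx = 0; Idx < 8; ++Idx) {
    bool A = (Idx >> 2) & 1; // src1
    bool B = (Idx >> 1) & 1; // src2
    bool C = Idx & 1;        // src3
    if (F(A, B, C))
      Imm |= uint8_t(1u << Idx);
  }
  return Imm;
}

int main() {
  // The operation matched by the new patterns: (and A, B) or'ed with (andnp A, C).
  uint8_t Imm =
      ternlogImm([](bool A, bool B, bool C) { return (A && B) || (!A && C); });
  printf("imm = %u (0x%02X)\n", Imm, Imm); // prints 202 (0xCA)
  assert(Imm == 202);
  return 0;
}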