From: Simon Pilgrim Date: Mon, 30 Nov 2020 15:59:46 +0000 (+0000) Subject: [X86] Add vbmi2 test coverage for vector rotations X-Git-Tag: llvmorg-13-init~4858 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=8fcc8c3148d087e9190cf6858c80f6306b3a93e5;p=platform%2Fupstream%2Fllvm.git [X86] Add vbmi2 test coverage for vector rotations We should be using the funnel shift instructions for vXi16 types. --- diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll index 96c3759..e48a932 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -7,6 +7,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2 @@ -123,6 +125,20 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; AVX512VLBW-NEXT: vprolvq %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: var_funnnel_v2i64: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: var_funnnel_v2i64: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vprolvq %xmm1, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOP-LABEL: var_funnnel_v2i64: ; XOP: # %bb.0: ; XOP-NEXT: vprotq %xmm1, %xmm0, %xmm0 @@ -245,6 +261,20 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %amt) nounwind { ; AVX512VLBW-NEXT: vprolvd %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: var_funnnel_v4i32: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: var_funnnel_v4i32: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vprolvd %xmm1, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOP-LABEL: var_funnnel_v4i32: ; XOP: # %bb.0: ; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm0 @@ -407,6 +437,28 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { ; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: var_funnnel_v8i16: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 
+; AVX512VBMI2-NEXT: vpsubw %xmm1, %xmm3, %xmm1 +; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: var_funnnel_v8i16: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vpsllvw %xmm1, %xmm0, %xmm2 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] +; AVX512VLVBMI2-NEXT: vpsubw %xmm1, %xmm3, %xmm1 +; AVX512VLVBMI2-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOP-LABEL: var_funnnel_v8i16: ; XOP: # %bb.0: ; XOP-NEXT: vprotw %xmm1, %xmm0, %xmm0 @@ -613,6 +665,41 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; AVX512VLBW-NEXT: vzeroupper ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: var_funnnel_v16i8: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero +; AVX512VBMI2-NEXT: vpsllvw %zmm3, %zmm0, %zmm3 +; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1 +; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: var_funnnel_v16i8: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VLVBMI2-NEXT: vpsllvw %ymm3, %ymm0, %ymm3 +; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1 +; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512VLVBMI2-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 +; 
AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512VLVBMI2-NEXT: vzeroupper +; AVX512VLVBMI2-NEXT: retq +; ; XOP-LABEL: var_funnnel_v16i8: ; XOP: # %bb.0: ; XOP-NEXT: vprotb %xmm1, %xmm0, %xmm0 @@ -724,6 +811,21 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind ; AVX512VLBW-NEXT: vprolvq %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI2-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i64: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vprolvq %xmm1, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: splatvar_funnnel_v2i64: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] @@ -845,6 +947,21 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %amt) nounwind ; AVX512VLBW-NEXT: vprolvd %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i32: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vprolvd %xmm1, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: splatvar_funnnel_v4i32: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] @@ -1123,6 +1240,41 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; AVX512VLBW-NEXT: vzeroupper ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VBMI2-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1 +; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VBMI2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = 
xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1 +; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512VLVBMI2-NEXT: vzeroupper +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: splatvar_funnnel_v16i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -1249,6 +1401,20 @@ define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x) nounwind { ; AVX512VLBW-NEXT: vprolvq {{.*}}(%rip), %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: constant_funnnel_v2i64: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] +; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: constant_funnnel_v2i64: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vprolvq {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOP-LABEL: constant_funnnel_v2i64: ; XOP: # %bb.0: ; XOP-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0 @@ -1359,6 +1525,20 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind { ; AVX512VLBW-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: constant_funnnel_v4i32: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] +; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: constant_funnnel_v4i32: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOP-LABEL: constant_funnnel_v4i32: ; XOP: # %bb.0: ; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 @@ -1435,6 +1615,24 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x) nounwind { ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: constant_funnnel_v8i16: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [16,15,14,13,12,11,10,9] +; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] +; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: constant_funnnel_v8i16: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm1 +; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOP-LABEL: constant_funnnel_v8i16: ; XOP: # %bb.0: ; XOP-NEXT: vprotw 
{{.*}}(%rip), %xmm0, %xmm0 @@ -1578,6 +1776,29 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512VLBW-NEXT: vzeroupper ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: constant_funnnel_v16i8: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7] +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] +; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: constant_funnnel_v16i8: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm1 +; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512VLVBMI2-NEXT: vzeroupper +; AVX512VLVBMI2-NEXT: retq +; ; XOP-LABEL: constant_funnnel_v16i8: ; XOP: # %bb.0: ; XOP-NEXT: vprotb {{.*}}(%rip), %xmm0, %xmm0 @@ -1656,6 +1877,19 @@ define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x) nounwind { ; AVX512VLBW-NEXT: vprolq $14, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatconstant_funnnel_v2i64: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI2-NEXT: vprolq $14, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v2i64: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vprolq $14, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOP-LABEL: splatconstant_funnnel_v2i64: ; XOP: # %bb.0: ; XOP-NEXT: vprotq $14, %xmm0, %xmm0 @@ -1716,6 +1950,19 @@ define <4 x i32> @splatconstant_funnnel_v4i32(<4 x i32> %x) nounwind { ; AVX512VLBW-NEXT: vprold $4, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i32: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI2-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i32: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vprold $4, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOP-LABEL: splatconstant_funnnel_v4i32: ; XOP: # %bb.0: ; XOP-NEXT: vprotd $4, %xmm0, %xmm0 @@ -1823,6 +2070,22 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %xmm1, %xmm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpsrlw $4, %xmm0, %xmm1 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm0 +; 
AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm1 +; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %xmm1, %xmm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOP-LABEL: splatconstant_funnnel_v16i8: ; XOP: # %bb.0: ; XOP-NEXT: vprotb $4, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll index f463b05..eb07daa 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -5,6 +5,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOPAVX2 @@ -89,6 +91,19 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind { ; AVX512VLBW-NEXT: vprolvq %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: var_funnnel_v4i64: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: var_funnnel_v4i64: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vprolvq %ymm1, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: var_funnnel_v4i64: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 @@ -183,6 +198,19 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind { ; AVX512VLBW-NEXT: vprolvd %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: var_funnnel_v8i32: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: var_funnnel_v8i32: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vprolvd %ymm1, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: var_funnnel_v8i32: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 @@ -314,6 +342,27 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind { ; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: var_funnnel_v16i16: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 +; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 +; AVX512VBMI2-NEXT: vmovdqa 
{{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VBMI2-NEXT: vpsubw %ymm1, %ymm3, %ymm1 +; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: var_funnnel_v16i16: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 +; AVX512VLVBMI2-NEXT: vpsllvw %ymm1, %ymm0, %ymm2 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLVBMI2-NEXT: vpsubw %ymm1, %ymm3, %ymm1 +; AVX512VLVBMI2-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: var_funnnel_v16i16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 @@ -485,6 +534,38 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: var_funnnel_v32i8: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VBMI2-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VBMI2-NEXT: vpsllvw %zmm3, %zmm0, %zmm3 +; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VBMI2-NEXT: vpsubb %ymm1, %ymm4, %ymm1 +; AVX512VBMI2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: var_funnnel_v32i8: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VLVBMI2-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = 
ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VLVBMI2-NEXT: vpsllvw %zmm3, %zmm0, %zmm3 +; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLVBMI2-NEXT: vpsubb %ymm1, %ymm4, %ymm1 +; AVX512VLVBMI2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512VLVBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: var_funnnel_v32i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 @@ -574,6 +655,20 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind ; AVX512VLBW-NEXT: vprolvq %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatvar_funnnel_v4i64: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i64: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512VLVBMI2-NEXT: vprolvq %ymm1, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: splatvar_funnnel_v4i64: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] @@ -657,6 +752,20 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind ; AVX512VLBW-NEXT: vprolvd %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatvar_funnnel_v8i32: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %ymm1 +; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i32: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm1, %ymm1 +; AVX512VLVBMI2-NEXT: vprolvd %ymm1, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: splatvar_funnnel_v8i32: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] @@ -873,6 +982,40 @@ define 
<32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm3 +; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1 +; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm3 +; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1 +; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: splatvar_funnnel_v32i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -954,6 +1097,19 @@ define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x) nounwind { ; AVX512VLBW-NEXT: vprolvq {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: constant_funnnel_v4i64: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60] +; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: constant_funnnel_v4i64: +; AVX512VLVBMI2: # %bb.0: 
+; AVX512VLVBMI2-NEXT: vprolvq {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: constant_funnnel_v4i64: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1 @@ -1033,6 +1189,19 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind { ; AVX512VLBW-NEXT: vprolvd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: constant_funnnel_v8i32: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] +; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: constant_funnnel_v8i32: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vprolvd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: constant_funnnel_v8i32: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1 @@ -1108,6 +1277,23 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x) nounwind { ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: constant_funnnel_v16i16: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] +; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: constant_funnnel_v16i16: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm1 +; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: constant_funnnel_v16i16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1 @@ -1261,6 +1447,24 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: constant_funnnel_v32i8: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm1 +; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: constant_funnnel_v32i8: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VLVBMI2-NEXT: vpsrlvw 
{{.*}}(%rip), %zmm0, %zmm1 +; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: constant_funnnel_v32i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 @@ -1330,6 +1534,18 @@ define <4 x i64> @splatconstant_funnnel_v4i64(<4 x i64> %x) nounwind { ; AVX512VLBW-NEXT: vprolq $14, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i64: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vprolq $14, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i64: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vprolq $14, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: splatconstant_funnnel_v4i64: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm1 @@ -1393,6 +1609,18 @@ define <8 x i32> @splatconstant_funnnel_v8i32(<8 x i32> %x) nounwind { ; AVX512VLBW-NEXT: vprold $4, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i32: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i32: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vprold $4, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: splatconstant_funnnel_v8i32: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1 @@ -1517,6 +1745,22 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i8: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpsrlw $4, %ymm0, %ymm1 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 +; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v32i8: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpsllw $4, %ymm0, %ymm1 +; AVX512VLVBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll index 6671f3e..069bf36 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll @@ -3,6 +3,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2 declare <8 x 
i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>) declare <16 x i32> @llvm.fshl.v16i32(<16 x i32>, <16 x i32>, <16 x i32>) @@ -117,6 +119,30 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0 ; AVX512VLBW-NEXT: retq +; +; AVX512VBMI2-LABEL: var_funnnel_v32i16: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm3 +; AVX512VBMI2-NEXT: vpsllvw %zmm3, %zmm0, %zmm3 +; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VBMI2-NEXT: vpsubw %zmm1, %zmm4, %zmm1 +; AVX512VBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: var_funnnel_v32i16: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLVBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm3 +; AVX512VLVBMI2-NEXT: vpsllvw %zmm3, %zmm0, %zmm3 +; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLVBMI2-NEXT: vpsubw %zmm1, %zmm4, %zmm1 +; AVX512VLVBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm1 +; AVX512VLVBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512VLVBMI2-NEXT: retq %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %amt) ret <32 x i16> %res } @@ -333,6 +359,82 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512VLBW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} ; AVX512VLBW-NEXT: vporq %zmm2, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq +; +; AVX512VBMI2-LABEL: var_funnnel_v64i8: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpsubb %zmm1, %zmm2, %zmm2 +; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm2 +; AVX512VBMI2-NEXT: vpsllw $5, %zmm2, %zmm2 +; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm4 +; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k2 +; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm2 +; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k2} +; AVX512VBMI2-NEXT: vpsrlw $2, %zmm2, %zmm5 +; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 +; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1} +; AVX512VBMI2-NEXT: vpsrlw $1, %zmm2, %zmm5 +; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 +; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4 +; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1} +; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vpsllw $5, %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm3 +; AVX512VBMI2-NEXT: vpmovb2m %zmm3, %k1 +; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k2 +; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 +; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2} +; AVX512VBMI2-NEXT: vpsllw $2, %zmm0, %zmm1 +; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm1 +; AVX512VBMI2-NEXT: 
vpmovb2m %zmm1, %k1 +; AVX512VBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} +; AVX512VBMI2-NEXT: vporq %zmm2, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: var_funnnel_v64i8: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLVBMI2-NEXT: vpsubb %zmm1, %zmm2, %zmm2 +; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm2 +; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm2, %zmm2 +; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm4 +; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k2 +; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm2 +; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VLVBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k2} +; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm2, %zmm5 +; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 +; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1} +; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm2, %zmm5 +; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 +; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4 +; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1} +; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm1, %zmm1 +; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm1, %zmm1 +; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm3 +; AVX512VLVBMI2-NEXT: vpmovb2m %zmm3, %k1 +; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k2 +; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 +; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2} +; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm0, %zmm1 +; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm1 +; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLVBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} +; AVX512VLVBMI2-NEXT: vporq %zmm2, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: retq %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %amt) ret <64 x i8> %res } @@ -441,6 +543,36 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0 ; AVX512VLBW-NEXT: retq +; +; AVX512VBMI2-LABEL: splatvar_funnnel_v32i16: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15] +; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm3 +; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VBMI2-NEXT: vpsubw %xmm1, %xmm4, %xmm1 +; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i16: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15] +; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512VLVBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm3 +; AVX512VLVBMI2-NEXT: vpxor %xmm4, 
%xmm4, %xmm4 +; AVX512VLVBMI2-NEXT: vpsubw %xmm1, %xmm4, %xmm1 +; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512VLVBMI2-NEXT: retq %splat = shufflevector <32 x i16> %amt, <32 x i16> undef, <32 x i32> zeroinitializer %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %splat) ret <32 x i16> %res @@ -580,6 +712,50 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %zmm0 ; AVX512VLBW-NEXT: vpternlogq $236, %zmm2, %zmm3, %zmm0 ; AVX512VLBW-NEXT: retq +; +; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm4 +; AVX512VBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX512VBMI2-NEXT: vpsllw %xmm3, %xmm5, %xmm3 +; AVX512VBMI2-NEXT: vpbroadcastb %xmm3, %zmm3 +; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3 +; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1 +; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 +; AVX512VBMI2-NEXT: vpsrlw %xmm1, %xmm5, %xmm0 +; AVX512VBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VBMI2-NEXT: vpbroadcastb %xmm0, %zmm0 +; AVX512VBMI2-NEXT: vpternlogq $236, %zmm2, %zmm3, %zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm4 +; AVX512VLVBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %xmm5, %xmm3 +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm3, %zmm3 +; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3 +; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1 +; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 +; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %xmm5, %xmm0 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpternlogq $236, %zmm2, %zmm3, %zmm0 +; AVX512VLVBMI2-NEXT: retq %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %splat) ret <64 x i8> %res @@ -657,6 +833,20 @@ define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x) nounwind { ; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq +; +; 
AVX512VBMI2-LABEL: constant_funnnel_v32i16: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm1 +; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: constant_funnnel_v32i16: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm1 +; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: retq %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> ) ret <32 x i16> %res } @@ -813,6 +1003,60 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512VLBW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0 ; AVX512VLBW-NEXT: retq +; +; AVX512VBMI2-LABEL: constant_funnnel_v64i8: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512VBMI2-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm2 +; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1} +; AVX512VBMI2-NEXT: vpsllw $2, %zmm2, %zmm3 +; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} +; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1} +; AVX512VBMI2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] +; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] +; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpackuswb %zmm3, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vporq %zmm0, %zmm2, %zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: constant_funnnel_v64i8: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = 
[8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] +; AVX512VLVBMI2-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm2 +; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VLVBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1} +; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm2, %zmm3 +; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} +; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1} +; AVX512VLVBMI2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] +; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3 +; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] +; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpackuswb %zmm3, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm2, %zmm0 +; AVX512VLVBMI2-NEXT: retq %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> ) ret <64 x i8> %res } @@ -877,6 +1121,20 @@ define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x) nounwind { ; AVX512VLBW-NEXT: vpsllw $7, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq +; +; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i16: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpsrlw $9, %zmm0, %zmm1 +; AVX512VBMI2-NEXT: vpsllw $7, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v32i16: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpsrlw $9, %zmm0, %zmm1 +; AVX512VLVBMI2-NEXT: vpsllw $7, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: retq %res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> ) ret <32 x i16> %res } @@ -919,6 +1177,20 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0 ; AVX512VLBW-NEXT: 
vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0 ; AVX512VLBW-NEXT: retq +; +; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 +; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 +; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0 +; AVX512VLVBMI2-NEXT: retq %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> ) ret <64 x i8> %res } diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll index a5950ff..687f215 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll @@ -7,6 +7,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512VL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VLBW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2 @@ -112,6 +114,20 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %amt) nounwind { ; AVX512VLBW-NEXT: vprolvd %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: var_funnnel_v2i32: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: var_funnnel_v2i32: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vprolvd %xmm1, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOP-LABEL: var_funnnel_v2i32: ; XOP: # %bb.0: ; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm0 @@ -244,6 +260,21 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %amt) nounwind ; AVX512VLBW-NEXT: vprolvd %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatvar_funnnel_v2i32: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i32: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vprolvd %xmm1, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: splatvar_funnnel_v2i32: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] @@ -364,6 +395,20 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512VLBW-NEXT: vprolvd 
{{.*}}(%rip), %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: constant_funnnel_v2i32: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,u,u> +; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: constant_funnnel_v2i32: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOP-LABEL: constant_funnnel_v2i32: ; XOP: # %bb.0: ; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 @@ -456,6 +501,19 @@ define <2 x i32> @splatconstant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512VLBW-NEXT: vprold $4, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatconstant_funnnel_v2i32: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI2-NEXT: vprold $4, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v2i32: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vprold $4, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOP-LABEL: splatconstant_funnnel_v2i32: ; XOP: # %bb.0: ; XOP-NEXT: vprotd $4, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll index 64ccaff..8da9d47 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -7,6 +7,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2 @@ -123,6 +125,20 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind { ; AVX512VLBW-NEXT: vprorvq %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: var_funnnel_v2i64: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI2-NEXT: vprorvq %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: var_funnnel_v2i64: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vprorvq %xmm1, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOP-LABEL: var_funnnel_v2i64: ; XOP: # %bb.0: ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -255,6 +271,20 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %amt) nounwind { ; AVX512VLBW-NEXT: vprorvd %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: var_funnnel_v4i32: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # 
kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: var_funnnel_v4i32: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vprorvd %xmm1, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOP-LABEL: var_funnnel_v4i32: ; XOP: # %bb.0: ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -437,6 +467,32 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind { ; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: var_funnnel_v8i16: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] +; AVX512VBMI2-NEXT: vpsubw %xmm1, %xmm3, %xmm1 +; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: var_funnnel_v8i16: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLVBMI2-NEXT: vpsubw %xmm1, %xmm2, %xmm1 +; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vpsllvw %xmm1, %xmm0, %xmm2 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] +; AVX512VLVBMI2-NEXT: vpsubw %xmm1, %xmm3, %xmm1 +; AVX512VLVBMI2-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOP-LABEL: var_funnnel_v8i16: ; XOP: # %bb.0: ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -649,6 +705,41 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind { ; AVX512VLBW-NEXT: vzeroupper ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: var_funnnel_v16i8: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero +; AVX512VBMI2-NEXT: vpsrlvw %zmm3, %zmm0, %zmm3 +; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1 +; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: var_funnnel_v16i8: +; AVX512VLVBMI2: # %bb.0: +; 
AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VLVBMI2-NEXT: vpsrlvw %ymm3, %ymm0, %ymm3 +; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1 +; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512VLVBMI2-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512VLVBMI2-NEXT: vzeroupper +; AVX512VLVBMI2-NEXT: retq +; ; XOP-LABEL: var_funnnel_v16i8: ; XOP: # %bb.0: ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -764,6 +855,21 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %amt) nounwind ; AVX512VLBW-NEXT: vprorvq %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI2-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vprorvq %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i64: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vprorvq %xmm1, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: splatvar_funnnel_v2i64: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -896,6 +1002,21 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %amt) nounwind ; AVX512VLBW-NEXT: vprorvd %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i32: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vprorvd %xmm1, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: splatvar_funnnel_v4i32: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -1199,6 +1320,41 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; AVX512VLBW-NEXT: vzeroupper ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = 
xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VBMI2-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 +; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1 +; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VBMI2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %ymm0, %ymm3 +; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1 +; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512VLVBMI2-NEXT: vzeroupper +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: splatvar_funnnel_v16i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -1329,6 +1485,20 @@ define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x) nounwind { ; AVX512VLBW-NEXT: vprorvq {{.*}}(%rip), %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: constant_funnnel_v2i64: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] +; AVX512VBMI2-NEXT: vprorvq %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: constant_funnnel_v2i64: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vprorvq {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOP-LABEL: constant_funnnel_v2i64: ; XOP: # %bb.0: ; XOP-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0 @@ -1439,6 +1609,20 @@ define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x) nounwind { ; AVX512VLBW-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: constant_funnnel_v4i32: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] +; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: constant_funnnel_v4i32: +; 
AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOP-LABEL: constant_funnnel_v4i32: ; XOP: # %bb.0: ; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 @@ -1515,6 +1699,24 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x) nounwind { ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: constant_funnnel_v8i16: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [16,1,2,3,4,5,6,7] +; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,15,14,13,12,11,10,9] +; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: constant_funnnel_v8i16: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm1 +; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOP-LABEL: constant_funnnel_v8i16: ; XOP: # %bb.0: ; XOP-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0 @@ -1658,6 +1860,29 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512VLBW-NEXT: vzeroupper ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: constant_funnnel_v16i8: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7] +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] +; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: constant_funnnel_v16i8: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm1 +; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512VLVBMI2-NEXT: vzeroupper +; AVX512VLVBMI2-NEXT: retq +; ; XOP-LABEL: constant_funnnel_v16i8: ; XOP: # %bb.0: ; XOP-NEXT: vprotb {{.*}}(%rip), %xmm0, %xmm0 @@ -1736,6 +1961,19 @@ define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x) nounwind { ; AVX512VLBW-NEXT: vprorq $14, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatconstant_funnnel_v2i64: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI2-NEXT: vprorq $14, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v2i64: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vprorq $14, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOP-LABEL: splatconstant_funnnel_v2i64: ; XOP: # 
%bb.0: ; XOP-NEXT: vprotq $50, %xmm0, %xmm0 @@ -1796,6 +2034,19 @@ define <4 x i32> @splatconstant_funnnel_v4i32(<4 x i32> %x) nounwind { ; AVX512VLBW-NEXT: vprord $4, %xmm0, %xmm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i32: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI2-NEXT: vprord $4, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VBMI2-NEXT: vzeroupper +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i32: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vprord $4, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOP-LABEL: splatconstant_funnnel_v4i32: ; XOP: # %bb.0: ; XOP-NEXT: vprotd $28, %xmm0, %xmm0 @@ -1903,6 +2154,22 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %xmm1, %xmm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm1 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm0, %xmm1 +; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %xmm1, %xmm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOP-LABEL: splatconstant_funnnel_v16i8: ; XOP: # %bb.0: ; XOP-NEXT: vprotb $4, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll index c917780..856ac94 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -5,6 +5,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOPAVX2 @@ -89,6 +91,19 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind { ; AVX512VLBW-NEXT: vprorvq %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: var_funnnel_v4i64: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vprorvq %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: var_funnnel_v4i64: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vprorvq %ymm1, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: var_funnnel_v4i64: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 @@ -193,6 +208,19 
@@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind { ; AVX512VLBW-NEXT: vprorvd %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: var_funnnel_v8i32: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: var_funnnel_v8i32: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vprorvd %ymm1, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: var_funnnel_v8i32: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 @@ -341,6 +369,31 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind { ; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: var_funnnel_v16i16: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpsubw %ymm1, %ymm2, %ymm1 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 +; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VBMI2-NEXT: vpsubw %ymm1, %ymm3, %ymm1 +; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: var_funnnel_v16i16: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLVBMI2-NEXT: vpsubw %ymm1, %ymm2, %ymm1 +; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 +; AVX512VLVBMI2-NEXT: vpsllvw %ymm1, %ymm0, %ymm2 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLVBMI2-NEXT: vpsubw %ymm1, %ymm3, %ymm1 +; AVX512VLVBMI2-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: var_funnnel_v16i16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 @@ -526,6 +579,38 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: var_funnnel_v32i8: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VBMI2-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VBMI2-NEXT: vpsrlvw 
%zmm3, %zmm0, %zmm3 +; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VBMI2-NEXT: vpsubb %ymm1, %ymm4, %ymm1 +; AVX512VBMI2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: var_funnnel_v32i8: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VLVBMI2-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VLVBMI2-NEXT: vpsrlvw %zmm3, %zmm0, %zmm3 +; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLVBMI2-NEXT: vpsubb %ymm1, %ymm4, %ymm1 +; AVX512VLVBMI2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512VLVBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: var_funnnel_v32i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 @@ -620,6 +705,20 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %amt) nounwind ; AVX512VLBW-NEXT: vprorvq %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatvar_funnnel_v4i64: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512VBMI2-NEXT: vprorvq %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i64: +; AVX512VLVBMI2: # %bb.0: 
+; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512VLVBMI2-NEXT: vprorvq %ymm1, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: splatvar_funnnel_v4i64: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] @@ -711,6 +810,20 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind ; AVX512VLBW-NEXT: vprorvd %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatvar_funnnel_v8i32: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %ymm1 +; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i32: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm1, %ymm1 +; AVX512VLVBMI2-NEXT: vprorvd %ymm1, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: splatvar_funnnel_v8i32: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] @@ -948,6 +1061,40 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VBMI2-NEXT: vpsrlw %xmm3, %zmm0, %zmm3 +; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1 +; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VBMI2-NEXT: vpsllw %xmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %zmm0, %zmm3 +; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, 
%xmm4 +; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1 +; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: splatvar_funnnel_v32i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -1032,6 +1179,19 @@ define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x) nounwind { ; AVX512VLBW-NEXT: vprorvq {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: constant_funnnel_v4i64: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60] +; AVX512VBMI2-NEXT: vprorvq %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: constant_funnnel_v4i64: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vprorvq {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: constant_funnnel_v4i64: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1 @@ -1111,6 +1271,19 @@ define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x) nounwind { ; AVX512VLBW-NEXT: vprorvd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: constant_funnnel_v8i32: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] +; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: constant_funnnel_v8i32: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vprorvd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: constant_funnnel_v8i32: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1 @@ -1186,6 +1359,23 @@ define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x) nounwind { ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: constant_funnnel_v16i16: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] +; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: constant_funnnel_v16i16: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm1 +; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: constant_funnnel_v16i16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1 @@ -1339,6 +1529,24 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: constant_funnnel_v32i8: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1 +; AVX512VBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: constant_funnnel_v32i8: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1 +; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: constant_funnnel_v32i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 @@ -1408,6 +1616,18 @@ define <4 x i64> @splatconstant_funnnel_v4i64(<4 x i64> %x) nounwind { ; AVX512VLBW-NEXT: vprorq $14, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i64: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vprorq $14, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i64: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vprorq $14, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: splatconstant_funnnel_v4i64: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vprotq $50, %xmm0, %xmm1 @@ -1471,6 +1691,18 @@ define <8 x i32> @splatconstant_funnnel_v8i32(<8 x i32> %x) nounwind { ; AVX512VLBW-NEXT: vprord $4, %ymm0, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i32: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VBMI2-NEXT: vprord $4, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i32: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vprord $4, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: splatconstant_funnnel_v8i32: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vprotd $28, %xmm0, %xmm1 @@ -1595,6 +1827,22 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i8: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm1 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 +; AVX512VBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: 
splatconstant_funnnel_v32i8: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpsrlw $4, %ymm0, %ymm1 +; AVX512VLVBMI2-NEXT: vpsllw $4, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll index 03dd5c0..9e4b460 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll @@ -3,6 +3,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2 declare <8 x i64> @llvm.fshr.v8i64(<8 x i64>, <8 x i64>, <8 x i64>) declare <16 x i32> @llvm.fshr.v16i32(<16 x i32>, <16 x i32>, <16 x i32>) @@ -117,6 +119,30 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0 ; AVX512VLBW-NEXT: retq +; +; AVX512VBMI2-LABEL: var_funnnel_v32i16: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm3 +; AVX512VBMI2-NEXT: vpsrlvw %zmm3, %zmm0, %zmm3 +; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VBMI2-NEXT: vpsubw %zmm1, %zmm4, %zmm1 +; AVX512VBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: var_funnnel_v32i16: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLVBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm3 +; AVX512VLVBMI2-NEXT: vpsrlvw %zmm3, %zmm0, %zmm3 +; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLVBMI2-NEXT: vpsubw %zmm1, %zmm4, %zmm1 +; AVX512VLVBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm1 +; AVX512VLVBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512VLVBMI2-NEXT: retq %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %amt) ret <32 x i16> %res } @@ -333,6 +359,82 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512VLBW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} ; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0 ; AVX512VLBW-NEXT: retq +; +; AVX512VBMI2-LABEL: var_funnnel_v64i8: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm3 +; AVX512VBMI2-NEXT: vpsllw $5, %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm4 +; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1 +; 
AVX512VBMI2-NEXT: vpmovb2m %zmm3, %k2 +; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm3 +; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k2} +; AVX512VBMI2-NEXT: vpsrlw $2, %zmm3, %zmm5 +; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 +; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1} +; AVX512VBMI2-NEXT: vpsrlw $1, %zmm3, %zmm5 +; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 +; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4 +; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1} +; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VBMI2-NEXT: vpsubb %zmm1, %zmm4, %zmm1 +; AVX512VBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vpsllw $5, %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm2 +; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1 +; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k2 +; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 +; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2} +; AVX512VBMI2-NEXT: vpsllw $2, %zmm0, %zmm1 +; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm1 +; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} +; AVX512VBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: var_funnnel_v64i8: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VLVBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm3 +; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm3, %zmm3 +; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm4 +; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VLVBMI2-NEXT: vpmovb2m %zmm3, %k2 +; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm3 +; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLVBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k2} +; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm3, %zmm5 +; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 +; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1} +; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm3, %zmm5 +; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 +; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4 +; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1} +; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLVBMI2-NEXT: vpsubb %zmm1, %zmm4, %zmm1 +; AVX512VLVBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm1 +; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm1, %zmm1 +; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm2 +; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1 +; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k2 +; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 +; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2} +; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm0, %zmm1 +; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 +; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm1 +; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VLVBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} +; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512VLVBMI2-NEXT: retq %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %amt) ret <64 x i8> %res } @@ -441,6 +543,36 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw ; 
AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0 ; AVX512VLBW-NEXT: retq +; +; AVX512VBMI2-LABEL: splatvar_funnnel_v32i16: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15] +; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX512VBMI2-NEXT: vpsrlw %xmm3, %zmm0, %zmm3 +; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VBMI2-NEXT: vpsubw %xmm1, %xmm4, %xmm1 +; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VBMI2-NEXT: vpsllw %xmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i16: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15] +; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512VLVBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %zmm0, %zmm3 +; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLVBMI2-NEXT: vpsubw %xmm1, %xmm4, %xmm1 +; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512VLVBMI2-NEXT: retq %splat = shufflevector <32 x i16> %amt, <32 x i16> undef, <32 x i32> zeroinitializer %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %splat) ret <32 x i16> %res @@ -576,6 +708,50 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %zmm0 ; AVX512VLBW-NEXT: vpternlogq $236, %zmm2, %zmm3, %zmm0 ; AVX512VLBW-NEXT: retq +; +; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VBMI2-NEXT: vpsrlw %xmm3, %zmm0, %zmm4 +; AVX512VBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX512VBMI2-NEXT: vpsrlw %xmm3, %xmm5, %xmm3 +; AVX512VBMI2-NEXT: vpsrlw $8, %xmm3, %xmm3 +; AVX512VBMI2-NEXT: vpbroadcastb %xmm3, %zmm3 +; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3 +; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1 +; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VBMI2-NEXT: vpsllw %xmm1, %zmm0, %zmm2 +; AVX512VBMI2-NEXT: vpsllw %xmm1, %xmm5, %xmm0 +; AVX512VBMI2-NEXT: vpbroadcastb %xmm0, %zmm0 +; AVX512VBMI2-NEXT: vpternlogq $236, %zmm2, %zmm3, %zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = 
xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %zmm0, %zmm4 +; AVX512VLVBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 +; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %xmm5, %xmm3 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm3, %xmm3 +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm3, %zmm3 +; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3 +; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1 +; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %zmm0, %zmm2 +; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %xmm5, %xmm0 +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpternlogq $236, %zmm2, %zmm3, %zmm0 +; AVX512VLVBMI2-NEXT: retq %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %splat) ret <64 x i8> %res @@ -653,6 +829,20 @@ define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x) nounwind { ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq +; +; AVX512VBMI2-LABEL: constant_funnnel_v32i16: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1 +; AVX512VBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: retq +; +; AVX512VLVBMI2-LABEL: constant_funnnel_v32i16: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1 +; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: retq %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> ) ret <32 x i16> %res } @@ -809,6 +999,60 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512VLBW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vporq %zmm2, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq +; +; AVX512VBMI2-LABEL: constant_funnnel_v64i8: +; AVX512VBMI2: # %bb.0: +; AVX512VBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536] +; AVX512VBMI2-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm2 +; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1} +; AVX512VBMI2-NEXT: vpsllw $2, %zmm2, %zmm3 +; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} +; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1 +; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1} +; AVX512VBMI2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = 
zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_funnnel_v64i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512VLVBMI2-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm2
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
+; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm2, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
+; AVX512VLVBMI2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: retq
 %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> )
 ret <64 x i8> %res
 }
@@ -873,6 +1117,20 @@ define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x) nounwind {
 ; AVX512VLBW-NEXT: vpsllw $9, %zmm0, %zmm1
 ; AVX512VLBW-NEXT: vpsrlw $7, %zmm0, %zmm0
 ; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
 ; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsllw $9, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpsrlw $7, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v32i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsllw $9, %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpsrlw $7, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: retq
 %res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> )
 ret <32 x i16> %res
 }
@@ -915,6 +1173,20 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind {
 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm0
 ; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
 ; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
+; AVX512VLVBMI2-NEXT: retq
 %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> )
 ret <64 x i8> %res
 }
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
index 5e6bf01..8201f3d 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
@@ -7,6 +7,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512VL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
@@ -120,6 +122,20 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %amt) nounwind {
 ; AVX512VLBW-NEXT: vprorvd %xmm1, %xmm0, %xmm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: var_funnnel_v2i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_funnnel_v2i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprorvd %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOP-LABEL: var_funnnel_v2i32:
 ; XOP: # %bb.0:
 ; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -264,6 +280,21 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %amt) nounwind
 ; AVX512VLBW-NEXT: vprorvd %xmm1, %xmm0, %xmm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: splatvar_funnnel_v2i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vprorvd %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOPAVX1-LABEL: splatvar_funnnel_v2i32:
 ; XOPAVX1: # %bb.0:
 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
@@ -390,6 +421,20 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
 ; AVX512VLBW-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: constant_funnnel_v2i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,u,u>
+; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_funnnel_v2i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOP-LABEL: constant_funnnel_v2i32:
 ; XOP: # %bb.0:
 ; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0
@@ -482,6 +527,19 @@ define <2 x i32> @splatconstant_funnnel_v2i32(<2 x i32> %x) nounwind {
 ; AVX512VLBW-NEXT: vprord $4, %xmm0, %xmm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: splatconstant_funnnel_v2i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vprord $4, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v2i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprord $4, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOP-LABEL: splatconstant_funnnel_v2i32:
 ; XOP: # %bb.0:
 ; XOP-NEXT: vprotd $28, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index adf9b80..fced1af 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -7,6 +7,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
@@ -106,6 +108,20 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; AVX512VLBW-NEXT: vprolvq %xmm1, %xmm0, %xmm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: var_rotate_v2i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_rotate_v2i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolvq %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOP-LABEL: var_rotate_v2i64:
 ; XOP: # %bb.0:
 ; XOP-NEXT: vprotq %xmm1, %xmm0, %xmm0
@@ -228,6 +244,20 @@ define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; AVX512VLBW-NEXT: vprolvd %xmm1, %xmm0, %xmm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: var_rotate_v4i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_rotate_v4i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolvd %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOP-LABEL: var_rotate_v4i32:
 ; XOP: # %bb.0:
 ; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm0
@@ -393,6 +423,28 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: var_rotate_v8i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT: vpsubw %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_rotate_v8i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpsllvw %xmm1, %xmm0, %xmm2
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512VLVBMI2-NEXT: vpsubw %xmm1, %xmm3, %xmm1
+; AVX512VLVBMI2-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOP-LABEL: var_rotate_v8i16:
 ; XOP: # %bb.0:
 ; XOP-NEXT: vprotw %xmm1, %xmm0, %xmm0
@@ -590,6 +642,35 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; AVX512VLBW-NEXT: vzeroupper
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: var_rotate_v16i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_rotate_v16i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLVBMI2-NEXT: vpsllvw %ymm1, %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLVBMI2-NEXT: vpsrlvw %ymm2, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512VLVBMI2-NEXT: vzeroupper
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOP-LABEL: var_rotate_v16i8:
 ; XOP: # %bb.0:
 ; XOP-NEXT: vprotb %xmm1, %xmm0, %xmm0
@@ -698,6 +779,21 @@ define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; AVX512VLBW-NEXT: vprolvq %xmm1, %xmm0, %xmm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: splatvar_rotate_v2i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_rotate_v2i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vprolvq %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOPAVX1-LABEL: splatvar_rotate_v2i64:
 ; XOPAVX1: # %bb.0:
 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
@@ -815,6 +911,21 @@ define <4 x i32> @splatvar_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; AVX512VLBW-NEXT: vprolvd %xmm1, %xmm0, %xmm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: splatvar_rotate_v4i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_rotate_v4i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vprolvd %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOPAVX1-LABEL: splatvar_rotate_v4i32:
 ; XOPAVX1: # %bb.0:
 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
@@ -1087,6 +1198,35 @@ define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; AVX512VLBW-NEXT: vzeroupper
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: splatvar_rotate_v16i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VBMI2-NEXT: vpsllw %xmm1, %ymm0, %ymm1
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_rotate_v16i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512VLVBMI2-NEXT: vzeroupper
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOPAVX1-LABEL: splatvar_rotate_v16i8:
 ; XOPAVX1: # %bb.0:
 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -1216,6 +1356,20 @@ define <2 x i64> @constant_rotate_v2i64(<2 x i64> %a) nounwind {
 ; AVX512VLBW-NEXT: vprolvq {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: constant_rotate_v2i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14]
+; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_rotate_v2i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOP-LABEL: constant_rotate_v2i64:
 ; XOP: # %bb.0:
 ; XOP-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0
@@ -1320,6 +1474,20 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
 ; AVX512VLBW-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: constant_rotate_v4i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7]
+; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_rotate_v4i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOP-LABEL: constant_rotate_v4i32:
 ; XOP: # %bb.0:
 ; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0
@@ -1398,6 +1566,24 @@ define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind {
 ; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: constant_rotate_v8i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [16,15,14,13,12,11,10,9]
+; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_rotate_v8i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm1
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOP-LABEL: constant_rotate_v8i16:
 ; XOP: # %bb.0:
 ; XOP-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0
@@ -1543,6 +1729,29 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
 ; AVX512VLBW-NEXT: vzeroupper
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: constant_rotate_v16i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
+; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_rotate_v16i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512VLVBMI2-NEXT: vzeroupper
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOP-LABEL: constant_rotate_v16i8:
 ; XOP: # %bb.0:
 ; XOP-NEXT: vprotb {{.*}}(%rip), %xmm0, %xmm0
@@ -1623,6 +1832,19 @@ define <2 x i64> @splatconstant_rotate_v2i64(<2 x i64> %a) nounwind {
 ; AVX512VLBW-NEXT: vprolq $14, %xmm0, %xmm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: splatconstant_rotate_v2i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vprolq $14, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_v2i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolq $14, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOP-LABEL: splatconstant_rotate_v2i64:
 ; XOP: # %bb.0:
 ; XOP-NEXT: vprotq $14, %xmm0, %xmm0
@@ -1683,6 +1905,19 @@ define <4 x i32> @splatconstant_rotate_v4i32(<4 x i32> %a) nounwind {
 ; AVX512VLBW-NEXT: vprold $4, %xmm0, %xmm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: splatconstant_rotate_v4i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vprold $4, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_v4i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprold $4, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOP-LABEL: splatconstant_rotate_v4i32:
 ; XOP: # %bb.0:
 ; XOP-NEXT: vprotd $4, %xmm0, %xmm0
@@ -1794,6 +2029,22 @@ define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind {
 ; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %xmm1, %xmm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: splatconstant_rotate_v16i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm1
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_v16i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm1
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %xmm1, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOP-LABEL: splatconstant_rotate_v16i8:
 ; XOP: # %bb.0:
 ; XOP-NEXT: vprotb $4, %xmm0, %xmm0
@@ -1859,6 +2110,20 @@ define <2 x i64> @splatconstant_rotate_mask_v2i64(<2 x i64> %a) nounwind {
 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v2i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vprolq $15, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v2i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolq $15, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOP-LABEL: splatconstant_rotate_mask_v2i64:
 ; XOP: # %bb.0:
 ; XOP-NEXT: vprotq $15, %xmm0, %xmm0
@@ -1924,6 +2189,20 @@ define <4 x i32> @splatconstant_rotate_mask_v4i32(<4 x i32> %a) nounwind {
 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v4i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vprold $4, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v4i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprold $4, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOP-LABEL: splatconstant_rotate_mask_v4i32:
 ; XOP: # %bb.0:
 ; XOP-NEXT: vprotd $4, %xmm0, %xmm0
@@ -1994,6 +2273,21 @@ define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind {
 ; AVX512VLBW-NEXT: vpternlogq $168, {{.*}}(%rip), %xmm1, %xmm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v8i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsrlw $11, %xmm0, %xmm1
+; AVX512VBMI2-NEXT: vpsllw $5, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v8i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsllw $5, %xmm0, %xmm1
+; AVX512VLVBMI2-NEXT: vpsrlw $11, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $168, {{.*}}(%rip), %xmm1, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOP-LABEL: splatconstant_rotate_mask_v8i16:
 ; XOP: # %bb.0:
 ; XOP-NEXT: vprotw $5, %xmm0, %xmm0
@@ -2072,6 +2366,23 @@ define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind {
 ; AVX512VLBW-NEXT: vpternlogq $248, {{.*}}(%rip), %xmm2, %xmm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v16i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm1
+; AVX512VBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v16i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm1
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $248, {{.*}}(%rip), %xmm2, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOP-LABEL: splatconstant_rotate_mask_v16i8:
 ; XOP: # %bb.0:
 ; XOP-NEXT: vprotb $4, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
index ec92e63..d5b747e 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -5,6 +5,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOPAVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOPAVX2
@@ -76,6 +78,19 @@ define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; AVX512VLBW-NEXT: vprolvq %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: var_rotate_v4i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_rotate_v4i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolvq %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOPAVX1-LABEL: var_rotate_v4i64:
 ; XOPAVX1: # %bb.0:
 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
@@ -173,6 +188,19 @@ define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
 ; AVX512VLBW-NEXT: vprolvd %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: var_rotate_v8i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_rotate_v8i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolvd %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOPAVX1-LABEL: var_rotate_v8i32:
 ; XOPAVX1: # %bb.0:
 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
@@ -307,6 +335,27 @@ define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
 ; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: var_rotate_v16i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT: vpsubw %ymm1, %ymm3, %ymm1
+; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_rotate_v16i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VLVBMI2-NEXT: vpsllvw %ymm1, %ymm0, %ymm2
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLVBMI2-NEXT: vpsubw %ymm1, %ymm3, %ymm1
+; AVX512VLVBMI2-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOPAVX1-LABEL: var_rotate_v16i16:
 ; XOPAVX1: # %bb.0:
 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
@@ -475,6 +524,32 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: var_rotate_v32i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %ymm1, %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_rotate_v32i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %ymm1, %ymm2, %ymm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLVBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLVBMI2-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOPAVX1-LABEL: var_rotate_v32i8:
 ; XOPAVX1: # %bb.0:
 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
@@ -555,6 +630,20 @@ define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; AVX512VLBW-NEXT: vprolvq %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: splatvar_rotate_v4i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_rotate_v4i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX512VLVBMI2-NEXT: vprolvq %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOPAVX1-LABEL: splatvar_rotate_v4i64:
 ; XOPAVX1: # %bb.0:
 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
@@ -641,6 +730,20 @@ define <8 x i32> @splatvar_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
 ; AVX512VLBW-NEXT: vprolvd %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: splatvar_rotate_v8i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %ymm1
+; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_rotate_v8i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm1, %ymm1
+; AVX512VLVBMI2-NEXT: vprolvd %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOPAVX1-LABEL: splatvar_rotate_v8i32:
 ; XOPAVX1: # %bb.0:
 ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
@@ -857,6 +960,34 @@ define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: splatvar_rotate_v32i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsllw %xmm2, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_rotate_v32i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %zmm0, %zmm2
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOPAVX1-LABEL: splatvar_rotate_v32i8:
 ; XOPAVX1: # %bb.0:
 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -941,6 +1072,19 @@ define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind {
 ; AVX512VLBW-NEXT: vprolvq {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: constant_rotate_v4i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
+; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_rotate_v4i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolvq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOPAVX1-LABEL: constant_rotate_v4i64:
 ; XOPAVX1: # %bb.0:
 ; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1
@@ -1022,6 +1166,19 @@ define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind {
 ; AVX512VLBW-NEXT: vprolvd {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: constant_rotate_v8i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
+; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_rotate_v8i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOPAVX1-LABEL: constant_rotate_v8i32:
 ; XOPAVX1: # %bb.0:
 ; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1
@@ -1099,6 +1256,23 @@ define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind {
 ; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: constant_rotate_v16i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
+; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_rotate_v16i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOPAVX1-LABEL: constant_rotate_v16i16:
 ; XOPAVX1: # %bb.0:
 ; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1
@@ -1254,6 +1428,24 @@ define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind {
 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: constant_rotate_v32i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_rotate_v32i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOPAVX1-LABEL: constant_rotate_v32i8:
 ; XOPAVX1: # %bb.0:
 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
@@ -1325,6 +1517,18 @@ define <4 x i64> @splatconstant_rotate_v4i64(<4 x i64> %a) nounwind {
 ; AVX512VLBW-NEXT: vprolq $14, %ymm0, %ymm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: splatconstant_rotate_v4i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vprolq $14, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_v4i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolq $14, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOPAVX1-LABEL: splatconstant_rotate_v4i64:
 ; XOPAVX1: # %bb.0:
 ; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm1
@@ -1390,6 +1594,18 @@ define <8 x i32> @splatconstant_rotate_v8i32(<8 x i32> %a) nounwind {
 ; AVX512VLBW-NEXT: vprold $4, %ymm0, %ymm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: splatconstant_rotate_v8i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vprold $4, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_v8i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprold $4, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOPAVX1-LABEL: splatconstant_rotate_v8i32:
 ; XOPAVX1: # %bb.0:
 ; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
@@ -1518,6 +1734,22 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind {
 ; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: splatconstant_rotate_v32i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm1
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_v32i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsllw $4, %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOPAVX1-LABEL: splatconstant_rotate_v32i8:
 ; XOPAVX1: # %bb.0:
 ; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
@@ -1585,6 +1817,19 @@ define <4 x i64> @splatconstant_rotate_mask_v4i64(<4 x i64> %a) nounwind {
 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v4i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vprolq $15, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v4i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolq $15, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64:
 ; XOPAVX1: # %bb.0:
 ; XOPAVX1-NEXT: vprotq $15, %xmm0, %xmm1
@@ -1658,6 +1903,19 @@ define <8 x i32> @splatconstant_rotate_mask_v8i32(<8 x i32> %a) nounwind {
 ; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v8i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vprold $4, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v8i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprold $4, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i32:
 ; XOPAVX1: # %bb.0:
 ; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
@@ -1735,6 +1993,21 @@ define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind {
 ; AVX512VLBW-NEXT: vpternlogq $168, {{.*}}(%rip), %ymm1, %ymm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v16i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsrlw $11, %ymm0, %ymm1
+; AVX512VBMI2-NEXT: vpsllw $5, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v16i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsllw $5, %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT: vpsrlw $11, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpternlogq $168, {{.*}}(%rip), %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOPAVX1-LABEL: splatconstant_rotate_mask_v16i16:
 ; XOPAVX1: # %bb.0:
 ; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm1
@@ -1824,6 +2097,23 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
 ; AVX512VLBW-NEXT: vpternlogq $248, {{.*}}(%rip), %ymm2, %ymm0
 ; AVX512VLBW-NEXT: retq
 ;
+; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v32i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm1
+; AVX512VBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v32i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsllw $4, %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT: vpternlogq $248, {{.*}}(%rip), %ymm2, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
 ; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8:
 ; XOPAVX1: # %bb.0:
 ; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll
index 690d9f7..13056a9 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-512.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll
@@ -3,6 +3,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2
 ;
 ; Variable Rotates
@@ -102,6 +104,16 @@ define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
 ; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
 ; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: var_rotate_v32i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_rotate_v32i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: retq
 %b16 = sub <32 x i16> , %b
 %shl = shl <32 x i16> %a, %b
 %lshr = lshr <32 x i16> %a, %b16
@@ -305,6 +317,76 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
 ; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: var_rotate_v64i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %zmm1, %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpsllw $5, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vpsllw $2, %zmm3, %zmm4
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vpsllw $5, %zmm2, %zmm1
+; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm2
+; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
+; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k2
+; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
+; AVX512VBMI2-NEXT: vpsrlw $2, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VBMI2-NEXT: vpsrlw $1, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_rotate_v64i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %zmm1, %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VLVBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm3, %zmm4
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm2, %zmm1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm2
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k2
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
+; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512VLVBMI2-NEXT: retq
 %b8 = sub <64 x i8> , %b
 %shl = shl <64 x i8> %a, %b
 %lshr = lshr <64 x i8> %a, %b8
@@ -400,6 +482,18 @@ define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind
 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
 ; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: splatvar_rotate_v32i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpbroadcastw %xmm1, %zmm1
+; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_rotate_v32i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: retq
 %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
 %splat16 = sub <32 x i16> , %splat
 %shl = shl <32 x i16> %a, %splat
@@ -496,6 +590,42 @@ define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %zmm0
 ; AVX512VLBW-NEXT: vpternlogq $236, %zmm3, %zmm2, %zmm0
 ; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: splatvar_rotate_v64i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsllw %xmm2, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpsllw %xmm2, %xmm4, %xmm2
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
+; AVX512VBMI2-NEXT: vpandq %zmm2, %zmm3, %zmm2
+; AVX512VBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vpsrlw %xmm1, %xmm4, %xmm0
+; AVX512VBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm0, %zmm0
+; AVX512VBMI2-NEXT: vpternlogq $236, %zmm3, %zmm2, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_rotate_v64i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %zmm0, %zmm3
+; AVX512VLVBMI2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %xmm4, %xmm2
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpandq %zmm2, %zmm3, %zmm2
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %xmm4, %xmm0
+; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $236, %zmm3, %zmm2, %zmm0
+; AVX512VLVBMI2-NEXT: retq
 %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
 %splat8 = sub <64 x i8> , %splat
 %shl = shl <64 x i8> %a, %splat
@@ -570,6 +700,16 @@ define <32 x i16> @constant_rotate_v32i16(<32 x i16> %a) nounwind {
 ; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
 ; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: constant_rotate_v32i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpshldvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_rotate_v32i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpshldvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: retq
 %shl = shl <32 x i16> %a, 
 %lshr = lshr <32 x i16> %a, 
 %or = or <32 x i16> %shl, %lshr
@@ -728,6 +868,60 @@ define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind {
 ; AVX512VLBW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
 ; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: constant_rotate_v64i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
+; AVX512VBMI2-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
+; AVX512VBMI2-NEXT: vpsllw $2, %zmm2, %zmm3
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
+; AVX512VBMI2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_rotate_v64i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
+; AVX512VLVBMI2-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm2
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
+; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm2, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
+; AVX512VLVBMI2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512VLVBMI2-NEXT: retq
 %shl = shl <64 x i8> %a, 
 %lshr = lshr <64 x i8> %a, 
 %or = or <64 x i8> %shl, %lshr
@@ -798,6 +992,16 @@ define <32 x i16> @splatconstant_rotate_v32i16(<32 x i16> %a) nounwind {
 ; AVX512VLBW-NEXT: vpsrlw $9, %zmm0, %zmm0
 ; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
 ; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: splatconstant_rotate_v32i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpshldw $7, %zmm0, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_v32i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpshldw $7, %zmm0, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: retq
 %shl = shl <32 x i16> %a, 
 %lshr = lshr <32 x i16> %a, 
 %or = or <32 x i16> %shl, %lshr
@@ -842,6 +1046,20 @@ define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind {
 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
 ; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
 ; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: splatconstant_rotate_v64i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_v64i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
+; AVX512VLVBMI2-NEXT: retq
 %shl = shl <64 x i8> %a, 
 %lshr = lshr <64 x i8> %a, 
 %or = or <64 x i8> %shl, %lshr
@@ -922,6 +1140,18 @@ define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind {
 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm0
 ; AVX512VLBW-NEXT: vpternlogq $248, {{.*}}(%rip), %zmm2, %zmm0
 ; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v32i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpshldw $5, %zmm0, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v32i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpshldw $5, %zmm0, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: retq
 %shl = shl <32 x i16> %a, 
 %lshr = lshr <32 x i16> %a, 
 %rmask = and <32 x i16> %lshr, 
@@ -972,6 +1202,22 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm0
 ; AVX512VLBW-NEXT: vpternlogq $248, {{.*}}(%rip), %zmm2, %zmm0
 ; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v64i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm0
+; AVX512VBMI2-NEXT: vpternlogq $248, {{.*}}(%rip), %zmm2, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v64i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm2
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $248, {{.*}}(%rip), %zmm2, %zmm0
+; AVX512VLVBMI2-NEXT: retq
 %shl = shl <64 x i8> %a, 
 %lshr = lshr <64 x i8> %a, 
 %rmask = and <64 x i8> %lshr,