; TODO: We should be using the funnel shift instructions for vXi16 types.
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
; AVX512VLBW-NEXT: vprolvq %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: var_funnnel_v2i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_funnnel_v2i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolvq %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: var_funnnel_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: vprotq %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: vprolvd %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: var_funnnel_v4i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_funnnel_v4i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolvd %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: var_funnnel_v4i32:
; XOP: # %bb.0:
; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: var_funnnel_v8i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT: vpsubw %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_funnnel_v8i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpsllvw %xmm1, %xmm0, %xmm2
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512VLVBMI2-NEXT: vpsubw %xmm1, %xmm3, %xmm1
+; AVX512VLVBMI2-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: var_funnnel_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vprotw %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: var_funnnel_v16i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm3, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_funnnel_v16i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLVBMI2-NEXT: vpsllvw %ymm3, %ymm0, %ymm3
+; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLVBMI2-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512VLVBMI2-NEXT: vzeroupper
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: var_funnnel_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vprotb %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: vprolvq %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vprolvq %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX512VLBW-NEXT: vprolvd %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vprolvd %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatvar_funnnel_v4i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VBMI2-NEXT: vpsllw %xmm3, %ymm0, %ymm3
+; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %ymm0, %ymm3
+; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512VLVBMI2-NEXT: vzeroupper
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatvar_funnnel_v16i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vprolvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: constant_funnnel_v2i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14]
+; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_funnnel_v2i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: constant_funnnel_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: constant_funnnel_v4i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7]
+; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_funnnel_v4i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: constant_funnnel_v4i32:
; XOP: # %bb.0:
; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: constant_funnnel_v8i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [16,15,14,13,12,11,10,9]
+; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
+; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_funnnel_v8i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm1
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: constant_funnnel_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: constant_funnnel_v16i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
+; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_funnnel_v16i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512VLVBMI2-NEXT: vzeroupper
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: constant_funnnel_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vprotb {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: vprolq $14, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_funnnel_v2i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vprolq $14, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v2i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolq $14, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: splatconstant_funnnel_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: vprotq $14, %xmm0, %xmm0
; AVX512VLBW-NEXT: vprold $4, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vprold $4, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprold $4, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: splatconstant_funnnel_v4i32:
; XOP: # %bb.0:
; XOP-NEXT: vprotd $4, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsrlw $4, %xmm0, %xmm1
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm1
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %xmm1, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: splatconstant_funnnel_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vprotb $4, %xmm0, %xmm0
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOPAVX2
; AVX512VLBW-NEXT: vprolvq %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: var_funnnel_v4i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_funnnel_v4i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolvq %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: var_funnnel_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512VLBW-NEXT: vprolvd %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: var_funnnel_v8i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_funnnel_v8i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolvd %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: var_funnnel_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: var_funnnel_v16i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT: vpsubw %ymm1, %ymm3, %ymm1
+; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_funnnel_v16i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VLVBMI2-NEXT: vpsllvw %ymm1, %ymm0, %ymm2
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLVBMI2-NEXT: vpsubw %ymm1, %ymm3, %ymm1
+; AVX512VLVBMI2-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: var_funnnel_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: var_funnnel_v32i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm3, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpsubb %ymm1, %ymm4, %ymm1
+; AVX512VBMI2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_funnnel_v32i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLVBMI2-NEXT: vpsllvw %zmm3, %zmm0, %zmm3
+; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLVBMI2-NEXT: vpsubb %ymm1, %ymm4, %ymm1
+; AVX512VLVBMI2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLVBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: var_funnnel_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512VLBW-NEXT: vprolvq %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatvar_funnnel_v4i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX512VLVBMI2-NEXT: vprolvq %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX512VLBW-NEXT: vprolvd %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatvar_funnnel_v8i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %ymm1
+; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm1, %ymm1
+; AVX512VLVBMI2-NEXT: vprolvd %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatvar_funnnel_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm3
+; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vprolvq {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: constant_funnnel_v4i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
+; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_funnnel_v4i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolvq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: constant_funnnel_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1
; AVX512VLBW-NEXT: vprolvd {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: constant_funnnel_v8i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
+; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_funnnel_v8i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: constant_funnnel_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1
; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: constant_funnnel_v16i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
+; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_funnnel_v16i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: constant_funnnel_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1
; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: constant_funnnel_v32i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_funnnel_v32i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: constant_funnnel_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512VLBW-NEXT: vprolq $14, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vprolq $14, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolq $14, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatconstant_funnnel_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm1
; AVX512VLBW-NEXT: vprold $4, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vprold $4, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprold $4, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatconstant_funnnel_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsrlw $4, %ymm0, %ymm1
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v32i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsllw $4, %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2
declare <8 x i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
declare <16 x i32> @llvm.fshl.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
; AVX512VLBW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: var_funnnel_v32i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; AVX512VBMI2-NEXT: vpsllvw %zmm3, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpsubw %zmm1, %zmm4, %zmm1
+; AVX512VBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_funnnel_v32i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLVBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; AVX512VLVBMI2-NEXT: vpsllvw %zmm3, %zmm0, %zmm3
+; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLVBMI2-NEXT: vpsubw %zmm1, %zmm4, %zmm1
+; AVX512VLVBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512VLVBMI2-NEXT: retq
%res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %amt)
ret <32 x i16> %res
}
; AVX512VLBW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512VLBW-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: var_funnnel_v64i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsubb %zmm1, %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vpsllw $5, %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm4
+; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k2
+; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k2}
+; AVX512VBMI2-NEXT: vpsrlw $2, %zmm2, %zmm5
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
+; AVX512VBMI2-NEXT: vpsrlw $1, %zmm2, %zmm5
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
+; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpsllw $5, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm3
+; AVX512VBMI2-NEXT: vpmovb2m %zmm3, %k1
+; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k2
+; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
+; AVX512VBMI2-NEXT: vpsllw $2, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm1
+; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
+; AVX512VBMI2-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_funnnel_v64i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT: vpsubb %zmm1, %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm4
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k2
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm2
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k2}
+; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm2, %zmm5
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
+; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm2, %zmm5
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1}
+; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm3
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm3, %k1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k2
+; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
+; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
+; AVX512VLVBMI2-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: retq
%res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %amt)
ret <64 x i8> %res
}
; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: splatvar_funnnel_v32i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpsubw %xmm1, %xmm4, %xmm1
+; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VLVBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm3
+; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLVBMI2-NEXT: vpsubw %xmm1, %xmm4, %xmm1
+; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512VLVBMI2-NEXT: retq
%splat = shufflevector <32 x i16> %amt, <32 x i16> undef, <32 x i32> zeroinitializer
%res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %splat)
ret <32 x i16> %res
; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %zmm0
; AVX512VLBW-NEXT: vpternlogq $236, %zmm2, %zmm3, %zmm0
; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm4
+; AVX512VBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512VBMI2-NEXT: vpsllw %xmm3, %xmm5, %xmm3
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
+; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vpsrlw %xmm1, %xmm5, %xmm0
+; AVX512VBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm0, %zmm0
+; AVX512VBMI2-NEXT: vpternlogq $236, %zmm2, %zmm3, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm4
+; AVX512VLVBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %xmm5, %xmm3
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm2
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %xmm5, %xmm0
+; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $236, %zmm2, %zmm3, %zmm0
+; AVX512VLVBMI2-NEXT: retq
%splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
%res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %splat)
ret <64 x i8> %res
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: constant_funnnel_v32i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_funnnel_v32i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: retq
%res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
ret <32 x i16> %res
}
; AVX512VLBW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: constant_funnnel_v64i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VBMI2-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
+; AVX512VBMI2-NEXT: vpsllw $2, %zmm2, %zmm3
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
+; AVX512VBMI2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_funnnel_v64i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VLVBMI2-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm2
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
+; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm2, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
+; AVX512VLVBMI2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512VLVBMI2-NEXT: retq
%res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
ret <64 x i8> %res
}
; AVX512VLBW-NEXT: vpsllw $7, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsrlw $9, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpsllw $7, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v32i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsrlw $9, %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpsllw $7, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: retq
%res = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
ret <32 x i16> %res
}
; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
+; AVX512VLVBMI2-NEXT: retq
%res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
ret <64 x i8> %res
}
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
; AVX512VLBW-NEXT: vprolvd %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: var_funnnel_v2i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_funnnel_v2i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolvd %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: var_funnnel_v2i32:
; XOP: # %bb.0:
; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: vprolvd %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatvar_funnnel_v2i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vprolvd %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatvar_funnnel_v2i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX512VLBW-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: constant_funnnel_v2i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,u,u>
+; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_funnnel_v2i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: constant_funnnel_v2i32:
; XOP: # %bb.0:
; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: vprold $4, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_funnnel_v2i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vprold $4, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v2i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprold $4, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: splatconstant_funnnel_v2i32:
; XOP: # %bb.0:
; XOP-NEXT: vprotd $4, %xmm0, %xmm0
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
; AVX512VLBW-NEXT: vprorvq %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: var_funnnel_v2i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vprorvq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_funnnel_v2i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprorvq %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: var_funnnel_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vprorvd %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: var_funnnel_v4i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_funnnel_v4i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprorvd %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: var_funnnel_v4i32:
; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: var_funnnel_v8i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsubw %xmm1, %xmm2, %xmm1
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT: vpsubw %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_funnnel_v8i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT: vpsubw %xmm1, %xmm2, %xmm1
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpsllvw %xmm1, %xmm0, %xmm2
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512VLVBMI2-NEXT: vpsubw %xmm1, %xmm3, %xmm1
+; AVX512VLVBMI2-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: var_funnnel_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: var_funnnel_v16i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX512VBMI2-NEXT: vpsrlvw %zmm3, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_funnnel_v16i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLVBMI2-NEXT: vpsrlvw %ymm3, %ymm0, %ymm3
+; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLVBMI2-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512VLVBMI2-NEXT: vzeroupper
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: var_funnnel_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vprorvq %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vprorvq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vprorvq %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatvar_funnnel_v2i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vprorvd %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vprorvd %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatvar_funnnel_v4i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm3, %ymm0, %ymm3
+; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %ymm0, %ymm3
+; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512VLVBMI2-NEXT: vzeroupper
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatvar_funnnel_v16i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vprorvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: constant_funnnel_v2i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14]
+; AVX512VBMI2-NEXT: vprorvq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_funnnel_v2i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprorvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: constant_funnnel_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: constant_funnnel_v4i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7]
+; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_funnnel_v4i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: constant_funnnel_v4i32:
; XOP: # %bb.0:
; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: constant_funnnel_v8i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [16,1,2,3,4,5,6,7]
+; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,15,14,13,12,11,10,9]
+; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_funnnel_v8i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm1
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: constant_funnnel_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: constant_funnnel_v16i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
+; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_funnnel_v16i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512VLVBMI2-NEXT: vzeroupper
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: constant_funnnel_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vprotb {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: vprorq $14, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_funnnel_v2i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vprorq $14, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v2i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprorq $14, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: splatconstant_funnnel_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: vprotq $50, %xmm0, %xmm0
; AVX512VLBW-NEXT: vprord $4, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vprord $4, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprord $4, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: splatconstant_funnnel_v4i32:
; XOP: # %bb.0:
; XOP-NEXT: vprotd $28, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm1
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm0, %xmm1
+; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %xmm1, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: splatconstant_funnnel_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vprotb $4, %xmm0, %xmm0
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOPAVX2
; AVX512VLBW-NEXT: vprorvq %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: var_funnnel_v4i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vprorvq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_funnnel_v4i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprorvq %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: var_funnnel_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512VLBW-NEXT: vprorvd %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: var_funnnel_v8i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_funnnel_v8i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprorvd %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: var_funnnel_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: var_funnnel_v16i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsubw %ymm1, %ymm2, %ymm1
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT: vpsubw %ymm1, %ymm3, %ymm1
+; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_funnnel_v16i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT: vpsubw %ymm1, %ymm2, %ymm1
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VLVBMI2-NEXT: vpsllvw %ymm1, %ymm0, %ymm2
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLVBMI2-NEXT: vpsubw %ymm1, %ymm3, %ymm1
+; AVX512VLVBMI2-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: var_funnnel_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: var_funnnel_v32i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VBMI2-NEXT: vpsrlvw %zmm3, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpsubb %ymm1, %ymm4, %ymm1
+; AVX512VBMI2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_funnnel_v32i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLVBMI2-NEXT: vpsrlvw %zmm3, %zmm0, %zmm3
+; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLVBMI2-NEXT: vpsubb %ymm1, %ymm4, %ymm1
+; AVX512VLVBMI2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLVBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: var_funnnel_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512VLBW-NEXT: vprorvq %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatvar_funnnel_v4i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX512VBMI2-NEXT: vprorvq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX512VLVBMI2-NEXT: vprorvq %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatvar_funnnel_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX512VLBW-NEXT: vprorvd %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatvar_funnnel_v8i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %ymm1
+; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm1, %ymm1
+; AVX512VLVBMI2-NEXT: vprorvd %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatvar_funnnel_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm3, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsllw %xmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %zmm0, %zmm3
+; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vprorvq {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: constant_funnnel_v4i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
+; AVX512VBMI2-NEXT: vprorvq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_funnnel_v4i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprorvq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: constant_funnnel_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1
; AVX512VLBW-NEXT: vprorvd {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: constant_funnnel_v8i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
+; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_funnnel_v8i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprorvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: constant_funnnel_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1
; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: constant_funnnel_v16i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
+; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_funnnel_v16i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: constant_funnnel_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1
; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: constant_funnnel_v32i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_funnnel_v32i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: constant_funnnel_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512VLBW-NEXT: vprorq $14, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vprorq $14, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprorq $14, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatconstant_funnnel_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotq $50, %xmm0, %xmm1
; AVX512VLBW-NEXT: vprord $4, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vprord $4, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprord $4, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatconstant_funnnel_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotd $28, %xmm0, %xmm1
; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm1
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v32i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT: vpsllw $4, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2
declare <8 x i64> @llvm.fshr.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
declare <16 x i32> @llvm.fshr.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: var_funnnel_v32i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; AVX512VBMI2-NEXT: vpsrlvw %zmm3, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpsubw %zmm1, %zmm4, %zmm1
+; AVX512VBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_funnnel_v32i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VLVBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; AVX512VLVBMI2-NEXT: vpsrlvw %zmm3, %zmm0, %zmm3
+; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLVBMI2-NEXT: vpsubw %zmm1, %zmm4, %zmm1
+; AVX512VLVBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512VLVBMI2-NEXT: retq
%res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %amt)
ret <32 x i16> %res
}
; AVX512VLBW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: var_funnnel_v64i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; AVX512VBMI2-NEXT: vpsllw $5, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm4
+; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT: vpmovb2m %zmm3, %k2
+; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k2}
+; AVX512VBMI2-NEXT: vpsrlw $2, %zmm3, %zmm5
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vpsrlw $1, %zmm3, %zmm5
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpsubb %zmm1, %zmm4, %zmm1
+; AVX512VBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpsllw $5, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm2
+; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
+; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k2
+; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
+; AVX512VBMI2-NEXT: vpsllw $2, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm1
+; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
+; AVX512VBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_funnnel_v64i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm4
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm3, %k2
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k2}
+; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm3, %zmm5
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm3, %zmm5
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLVBMI2-NEXT: vpsubb %zmm1, %zmm4, %zmm1
+; AVX512VLVBMI2-NEXT: vpandq %zmm2, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm2
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k2
+; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
+; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
+; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512VLVBMI2-NEXT: retq
%res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %amt)
ret <64 x i8> %res
}
; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: splatvar_funnnel_v32i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm3, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpsubw %xmm1, %xmm4, %xmm1
+; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VBMI2-NEXT: vpsllw %xmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15]
+; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VLVBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %zmm0, %zmm3
+; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLVBMI2-NEXT: vpsubw %xmm1, %xmm4, %xmm1
+; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512VLVBMI2-NEXT: retq
%splat = shufflevector <32 x i16> %amt, <32 x i16> undef, <32 x i32> zeroinitializer
%res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> %splat)
ret <32 x i16> %res
; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %zmm0
; AVX512VLBW-NEXT: vpternlogq $236, %zmm2, %zmm3, %zmm0
; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm3, %zmm0, %zmm4
+; AVX512VBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512VBMI2-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
+; AVX512VBMI2-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
+; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512VBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsllw %xmm1, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vpsllw %xmm1, %xmm5, %xmm0
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm0, %zmm0
+; AVX512VBMI2-NEXT: vpternlogq $236, %zmm2, %zmm3, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %zmm0, %zmm4
+; AVX512VLVBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
+; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512VLVBMI2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm4, %xmm1
+; AVX512VLVBMI2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %zmm0, %zmm2
+; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %xmm5, %xmm0
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $236, %zmm2, %zmm3, %zmm0
+; AVX512VLVBMI2-NEXT: retq
%splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
%res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %splat)
ret <64 x i8> %res
; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: constant_funnnel_v32i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_funnnel_v32i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: retq
%res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>)
ret <32 x i16> %res
}
; AVX512VLBW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: constant_funnnel_v64i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512VBMI2-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
+; AVX512VBMI2-NEXT: vpsllw $2, %zmm2, %zmm3
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
+; AVX512VBMI2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_funnnel_v64i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512VLVBMI2-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm2
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
+; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm2, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
+; AVX512VLVBMI2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: retq
%res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
ret <64 x i8> %res
}
; AVX512VLBW-NEXT: vpsrlw $7, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsllw $9, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpsrlw $7, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v32i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsllw $9, %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpsrlw $7, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: retq
%res = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %x, <32 x i16> %x, <32 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
ret <32 x i16> %res
}
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
+; AVX512VLVBMI2-NEXT: retq
%res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
ret <64 x i8> %res
}
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512VBMI2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512VLVBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
; AVX512VLBW-NEXT: vprorvd %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: var_funnnel_v2i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_funnnel_v2i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprorvd %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: var_funnnel_v2i32:
; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vprorvd %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatvar_funnnel_v2i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vprorvd %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatvar_funnnel_v2i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX512VLBW-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: constant_funnnel_v2i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,u,u>
+; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_funnnel_v2i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprorvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: constant_funnnel_v2i32:
; XOP: # %bb.0:
; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: vprord $4, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_funnnel_v2i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vprord $4, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v2i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprord $4, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: splatconstant_funnnel_v2i32:
; XOP: # %bb.0:
; XOP-NEXT: vprotd $28, %xmm0, %xmm0
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
; AVX512VLBW-NEXT: vprolvq %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: var_rotate_v2i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_rotate_v2i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolvq %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: var_rotate_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: vprotq %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: vprolvd %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: var_rotate_v4i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_rotate_v4i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolvd %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: var_rotate_v4i32:
; XOP: # %bb.0:
; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: var_rotate_v8i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT: vpsubw %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_rotate_v8i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpsllvw %xmm1, %xmm0, %xmm2
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512VLVBMI2-NEXT: vpsubw %xmm1, %xmm3, %xmm1
+; AVX512VLVBMI2-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: var_rotate_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vprotw %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: var_rotate_v16i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_rotate_v16i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLVBMI2-NEXT: vpsllvw %ymm1, %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLVBMI2-NEXT: vpsrlvw %ymm2, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512VLVBMI2-NEXT: vzeroupper
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: var_rotate_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vprotb %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: vprolvq %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatvar_rotate_v2i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_rotate_v2i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vprolvq %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatvar_rotate_v2i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX512VLBW-NEXT: vprolvd %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatvar_rotate_v4i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_rotate_v4i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vprolvd %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatvar_rotate_v4i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatvar_rotate_v16i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VBMI2-NEXT: vpsllw %xmm1, %ymm0, %ymm1
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_rotate_v16i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512VLVBMI2-NEXT: vzeroupper
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatvar_rotate_v16i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vprolvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: constant_rotate_v2i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14]
+; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_rotate_v2i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: constant_rotate_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: constant_rotate_v4i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7]
+; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_rotate_v4i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: constant_rotate_v4i32:
; XOP: # %bb.0:
; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: constant_rotate_v8i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [16,15,14,13,12,11,10,9]
+; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_rotate_v8i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm1
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: constant_rotate_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: constant_rotate_v16i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
+; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_rotate_v16i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512VLVBMI2-NEXT: vzeroupper
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: constant_rotate_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vprotb {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: vprolq $14, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_rotate_v2i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vprolq $14, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_v2i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolq $14, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: splatconstant_rotate_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: vprotq $14, %xmm0, %xmm0
; AVX512VLBW-NEXT: vprold $4, %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_rotate_v4i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vprold $4, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_v4i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprold $4, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: splatconstant_rotate_v4i32:
; XOP: # %bb.0:
; XOP-NEXT: vprotd $4, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_rotate_v16i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm1
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_v16i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm1
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %xmm1, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: splatconstant_rotate_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vprotb $4, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v2i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vprolq $15, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v2i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolq $15, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: splatconstant_rotate_mask_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: vprotq $15, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v4i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512VBMI2-NEXT: vprold $4, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v4i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprold $4, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: splatconstant_rotate_mask_v4i32:
; XOP: # %bb.0:
; XOP-NEXT: vprotd $4, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpternlogq $168, {{.*}}(%rip), %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v8i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsrlw $11, %xmm0, %xmm1
+; AVX512VBMI2-NEXT: vpsllw $5, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v8i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsllw $5, %xmm0, %xmm1
+; AVX512VLVBMI2-NEXT: vpsrlw $11, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $168, {{.*}}(%rip), %xmm1, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: splatconstant_rotate_mask_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vprotw $5, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpternlogq $248, {{.*}}(%rip), %xmm2, %xmm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v16i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm1
+; AVX512VBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v16i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm1
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $248, {{.*}}(%rip), %xmm2, %xmm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOP-LABEL: splatconstant_rotate_mask_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vprotb $4, %xmm0, %xmm0
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOPAVX2
; AVX512VLBW-NEXT: vprolvq %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: var_rotate_v4i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_rotate_v4i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolvq %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: var_rotate_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512VLBW-NEXT: vprolvd %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: var_rotate_v8i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_rotate_v8i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolvd %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: var_rotate_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: var_rotate_v16i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT: vpsubw %ymm1, %ymm3, %ymm1
+; AVX512VBMI2-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_rotate_v16i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VLVBMI2-NEXT: vpsllvw %ymm1, %ymm0, %ymm2
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLVBMI2-NEXT: vpsubw %ymm1, %ymm3, %ymm1
+; AVX512VLVBMI2-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: var_rotate_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: var_rotate_v32i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %ymm1, %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_rotate_v32i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %ymm1, %ymm2, %ymm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLVBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLVBMI2-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: var_rotate_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512VLBW-NEXT: vprolvq %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatvar_rotate_v4i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_rotate_v4i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX512VLVBMI2-NEXT: vprolvq %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatvar_rotate_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX512VLBW-NEXT: vprolvd %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatvar_rotate_v8i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %ymm1
+; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_rotate_v8i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm1, %ymm1
+; AVX512VLVBMI2-NEXT: vprolvd %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatvar_rotate_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatvar_rotate_v32i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsllw %xmm2, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_rotate_v32i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %zmm0, %zmm2
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatvar_rotate_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vprolvq {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: constant_rotate_v4i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
+; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_rotate_v4i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolvq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: constant_rotate_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1
; AVX512VLBW-NEXT: vprolvd {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: constant_rotate_v8i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
+; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_rotate_v8i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: constant_rotate_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1
; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: constant_rotate_v16i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
+; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_rotate_v16i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: constant_rotate_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1
; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: constant_rotate_v32i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_rotate_v32i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: constant_rotate_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512VLBW-NEXT: vprolq $14, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_rotate_v4i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vprolq $14, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_v4i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolq $14, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatconstant_rotate_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm1
; AVX512VLBW-NEXT: vprold $4, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_rotate_v8i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vprold $4, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_v8i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprold $4, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatconstant_rotate_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_rotate_v32i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm1
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_v32i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsllw $4, %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatconstant_rotate_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v4i64:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vprolq $15, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v4i64:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprolq $15, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotq $15, %xmm0, %xmm1
; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v8i32:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vprold $4, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v8i32:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vprold $4, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
; AVX512VLBW-NEXT: vpternlogq $168, {{.*}}(%rip), %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v16i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsrlw $11, %ymm0, %ymm1
+; AVX512VBMI2-NEXT: vpsllw $5, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v16i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsllw $5, %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT: vpsrlw $11, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpternlogq $168, {{.*}}(%rip), %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm1
; AVX512VLBW-NEXT: vpternlogq $248, {{.*}}(%rip), %ymm2, %ymm0
; AVX512VLBW-NEXT: retq
;
+; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v32i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm1
+; AVX512VBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512VBMI2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v32i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsllw $4, %ymm0, %ymm1
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT: vpternlogq $248, {{.*}}(%rip), %ymm2, %ymm0
+; AVX512VLVBMI2-NEXT: retq
+;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2
;
; Variable Rotates
; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: var_rotate_v32i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_rotate_v32i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: retq
%b16 = sub <32 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
%shl = shl <32 x i16> %a, %b
%lshr = lshr <32 x i16> %a, %b16
; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: var_rotate_v64i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %zmm1, %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpsllw $5, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vpsllw $2, %zmm3, %zmm4
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vpsllw $5, %zmm2, %zmm1
+; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm2
+; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
+; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k2
+; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
+; AVX512VBMI2-NEXT: vpsrlw $2, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VBMI2-NEXT: vpsrlw $1, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: var_rotate_v64i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %zmm1, %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VLVBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm3, %zmm4
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm2, %zmm1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm2
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k2
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2}
+; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512VLVBMI2-NEXT: retq
%b8 = sub <64 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
%shl = shl <64 x i8> %a, %b
%lshr = lshr <64 x i8> %a, %b8
; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: splatvar_rotate_v32i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpbroadcastw %xmm1, %zmm1
+; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_rotate_v32i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: retq
%splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
%splat16 = sub <32 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat
%shl = shl <32 x i16> %a, %splat
; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %zmm0
; AVX512VLBW-NEXT: vpternlogq $236, %zmm3, %zmm2, %zmm0
; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: splatvar_rotate_v64i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsllw %xmm2, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpsllw %xmm2, %xmm4, %xmm2
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
+; AVX512VBMI2-NEXT: vpandq %zmm2, %zmm3, %zmm2
+; AVX512VBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vpsrlw %xmm1, %xmm4, %xmm0
+; AVX512VBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm0, %zmm0
+; AVX512VBMI2-NEXT: vpternlogq $236, %zmm3, %zmm2, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatvar_rotate_v64i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %zmm0, %zmm3
+; AVX512VLVBMI2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %xmm4, %xmm2
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpandq %zmm2, %zmm3, %zmm2
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %xmm4, %xmm0
+; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $236, %zmm3, %zmm2, %zmm0
+; AVX512VLVBMI2-NEXT: retq
%splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
%splat8 = sub <64 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
%shl = shl <64 x i8> %a, %splat
; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: constant_rotate_v32i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpshldvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_rotate_v32i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpshldvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: retq
%shl = shl <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
%lshr = lshr <32 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1>
%or = or <32 x i16> %shl, %lshr
; AVX512VLBW-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: constant_rotate_v64i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
+; AVX512VBMI2-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
+; AVX512VBMI2-NEXT: vpsllw $2, %zmm2, %zmm3
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512VBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
+; AVX512VBMI2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512VBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: constant_rotate_v64i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
+; AVX512VLVBMI2-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm2
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpblendmb %zmm2, %zmm0, %zmm2 {%k1}
+; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm2, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
+; AVX512VLVBMI2-NEXT: vpaddb %zmm1, %zmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm1, %k1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
+; AVX512VLVBMI2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512VLVBMI2-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpackuswb %zmm3, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512VLVBMI2-NEXT: retq
%shl = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
%lshr = lshr <64 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
%or = or <64 x i8> %shl, %lshr
; AVX512VLBW-NEXT: vpsrlw $9, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: splatconstant_rotate_v32i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpshldw $7, %zmm0, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_v32i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpshldw $7, %zmm0, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: retq
%shl = shl <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
%lshr = lshr <32 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
%or = or <32 x i16> %shl, %lshr
; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: splatconstant_rotate_v64i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_v64i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $216, {{.*}}(%rip), %zmm1, %zmm0
+; AVX512VLVBMI2-NEXT: retq
%shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
%lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
%or = or <64 x i8> %shl, %lshr
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm0
; AVX512VLBW-NEXT: vpternlogq $248, {{.*}}(%rip), %zmm2, %zmm0
; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v32i16:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpshldw $5, %zmm0, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v32i16:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpshldw $5, %zmm0, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: retq
%shl = shl <32 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
%lshr = lshr <32 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
%rmask = and <32 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm0
; AVX512VLBW-NEXT: vpternlogq $248, {{.*}}(%rip), %zmm2, %zmm0
; AVX512VLBW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v64i8:
+; AVX512VBMI2: # %bb.0:
+; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
+; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm2
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm0
+; AVX512VBMI2-NEXT: vpternlogq $248, {{.*}}(%rip), %zmm2, %zmm0
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v64i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm2
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $248, {{.*}}(%rip), %zmm2, %zmm0
+; AVX512VLVBMI2-NEXT: retq
%shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
%lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
%rmask = and <64 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>