From 4f41ea20167c8ae5e12887b95104a4697a430203 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 14 Dec 2022 11:41:14 +0000 Subject: [PATCH] [X86] lowerShuffleAsVTRUNC - bit shift the offset elements into place instead of shuffle This helps avoid issues on non-BWI targets which can end up splitting the shuffles to 2 x 256-bit bitshifts of a smaller scalar width --- llvm/lib/Target/X86/X86ISelLowering.cpp | 17 +- .../CodeGen/X86/shuffle-strided-with-offset-256.ll | 62 +- .../CodeGen/X86/shuffle-strided-with-offset-512.ll | 312 +-- llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll | 61 +- .../X86/vector-interleaved-load-i16-stride-4.ll | 98 +- .../X86/vector-interleaved-load-i8-stride-4.ll | 189 +- .../X86/vector-interleaved-load-i8-stride-8.ll | 2937 ++++++++++---------- llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll | 4 +- llvm/test/CodeGen/X86/x86-interleaved-access.ll | 2 +- 9 files changed, 1671 insertions(+), 2011 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index c003dae..c627d366 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -12579,18 +12579,17 @@ static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2); SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2); - // Move the offset'd elements into place for the truncation. - if (Offset) { - SmallVector OffsetMask(NumElts * 2, -1); - for (unsigned I = 0, E = NumElts * 2; I < E; I += Scale) - OffsetMask[I] = I + Offset; - Src = DAG.getVectorShuffle(ConcatVT, DL, Src, DAG.getUNDEF(ConcatVT), - OffsetMask); - } - MVT SrcSVT = MVT::getIntegerVT(SrcEltBits); MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts); Src = DAG.getBitcast(SrcVT, Src); + + // Shift the offset'd elements into place for the truncation. + // TODO: Use getTargetVShiftByConstNode. 
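+  // Worked example (illustrative, taken from the test changes below): for
+  // shuffle_v32i8_to_v8i8_1 the result element type is i8 (EltSizeInBits = 8),
+  // Offset = 1 and Scale = 4, so the source is treated as i32 elements and the
+  // VSRLI below is a logical right shift by 1 * 8 = 8 bits (a single
+  // 'vpsrld $8') ahead of the vpmovdb truncation, rather than first building
+  // the explicit offset shuffle mask that the removed code above created.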
+ if (Offset) + Src = DAG.getNode( + X86ISD::VSRLI, DL, SrcVT, Src, + DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8)); + return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers); } } diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll index 02c66e3..52dd300 100644 --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll @@ -149,7 +149,7 @@ define void @shuffle_v32i8_to_v8i8_1(ptr %L, ptr %S) nounwind { ; AVX512F-LABEL: shuffle_v32i8_to_v8i8_1: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrld $8, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, (%rsi) ; AVX512F-NEXT: vzeroupper @@ -157,8 +157,7 @@ define void @shuffle_v32i8_to_v8i8_1(ptr %L, ptr %S) nounwind { ; ; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrld $8, (%rdi), %ymm0 ; AVX512VL-NEXT: vpmovdb %ymm0, (%rsi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -166,7 +165,7 @@ define void @shuffle_v32i8_to_v8i8_1(ptr %L, ptr %S) nounwind { ; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_1: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512BW-NEXT: vpsrld $8, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper @@ -174,7 +173,7 @@ define void @shuffle_v32i8_to_v8i8_1(ptr %L, ptr %S) nounwind { ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_1: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpsrld $8, (%rdi), %ymm0 ; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -317,7 +316,7 @@ define void @shuffle_v16i16_to_v4i16_1(ptr %L, ptr %S) nounwind { ; AVX512F-LABEL: shuffle_v16i16_to_v4i16_1: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlq $16, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, (%rsi) ; AVX512F-NEXT: vzeroupper @@ -325,7 +324,7 @@ define void @shuffle_v16i16_to_v4i16_1(ptr %L, ptr %S) nounwind { ; ; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrld $16, (%rdi), %ymm0 +; AVX512VL-NEXT: vpsrlq $16, (%rdi), %ymm0 ; AVX512VL-NEXT: vpmovqw %ymm0, (%rsi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -333,7 +332,7 @@ define void @shuffle_v16i16_to_v4i16_1(ptr %L, ptr %S) nounwind { ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_1: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512BW-NEXT: vpsrlq $16, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper @@ -341,7 +340,7 @@ define void @shuffle_v16i16_to_v4i16_1(ptr %L, ptr %S) nounwind { ; ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpsrlq $16, (%rdi), %ymm0 ; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -385,7 +384,8 @@ define void @shuffle_v16i16_to_v4i16_2(ptr %L, ptr %S) nounwind { ; ; AVX512F-LABEL: shuffle_v16i16_to_v4i16_2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpshufd {{.*#+}} ymm0 
= mem[1,1,3,3,5,5,7,7] +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, (%rsi) ; AVX512F-NEXT: vzeroupper @@ -393,14 +393,15 @@ define void @shuffle_v16i16_to_v4i16_2(ptr %L, ptr %S) nounwind { ; ; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_2: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = mem[1,1,3,3,5,5,7,7] +; AVX512VL-NEXT: vpsrlq $32, (%rdi), %ymm0 ; AVX512VL-NEXT: vpmovqw %ymm0, (%rsi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm0 = mem[1,1,3,3,5,5,7,7] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper @@ -408,7 +409,7 @@ define void @shuffle_v16i16_to_v4i16_2(ptr %L, ptr %S) nounwind { ; ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_2: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpshufd {{.*#+}} ymm0 = mem[1,1,3,3,5,5,7,7] +; AVX512BWVL-NEXT: vpsrlq $32, (%rdi), %ymm0 ; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -502,7 +503,7 @@ define void @shuffle_v32i8_to_v4i8_1(ptr %L, ptr %S) nounwind { ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_1: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlq $8, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, (%rsi) ; AVX512F-NEXT: vzeroupper @@ -510,8 +511,7 @@ define void @shuffle_v32i8_to_v4i8_1(ptr %L, ptr %S) nounwind { ; ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_1: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlq $8, (%rdi), %ymm0 ; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -519,7 +519,7 @@ define void @shuffle_v32i8_to_v4i8_1(ptr %L, ptr %S) nounwind { ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_1: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512BW-NEXT: vpsrlq $8, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper @@ -527,7 +527,7 @@ define void @shuffle_v32i8_to_v4i8_1(ptr %L, ptr %S) nounwind { ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_1: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpsrlq $8, (%rdi), %ymm0 ; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -552,7 +552,7 @@ define void @shuffle_v32i8_to_v4i8_2(ptr %L, ptr %S) nounwind { ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlq $16, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, (%rsi) ; AVX512F-NEXT: vzeroupper @@ -560,7 +560,7 @@ define void @shuffle_v32i8_to_v4i8_2(ptr %L, ptr %S) nounwind { ; ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_2: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrld $16, (%rdi), %ymm0 +; AVX512VL-NEXT: vpsrlq $16, (%rdi), %ymm0 ; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -568,7 +568,7 @@ define void @shuffle_v32i8_to_v4i8_2(ptr %L, ptr %S) nounwind { ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa 
(%rdi), %ymm0 -; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512BW-NEXT: vpsrlq $16, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper @@ -576,7 +576,7 @@ define void @shuffle_v32i8_to_v4i8_2(ptr %L, ptr %S) nounwind { ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_2: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpsrlq $16, (%rdi), %ymm0 ; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -601,7 +601,7 @@ define void @shuffle_v32i8_to_v4i8_3(ptr %L, ptr %S) nounwind { ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_3: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlq $24, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, (%rsi) ; AVX512F-NEXT: vzeroupper @@ -609,7 +609,7 @@ define void @shuffle_v32i8_to_v4i8_3(ptr %L, ptr %S) nounwind { ; ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_3: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrld $24, (%rdi), %ymm0 +; AVX512VL-NEXT: vpsrlq $24, (%rdi), %ymm0 ; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -617,7 +617,7 @@ define void @shuffle_v32i8_to_v4i8_3(ptr %L, ptr %S) nounwind { ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_3: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpsrld $24, %ymm0, %ymm0 +; AVX512BW-NEXT: vpsrlq $24, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper @@ -625,7 +625,7 @@ define void @shuffle_v32i8_to_v4i8_3(ptr %L, ptr %S) nounwind { ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_3: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrld $24, (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpsrlq $24, (%rdi), %ymm0 ; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -649,7 +649,8 @@ define void @shuffle_v32i8_to_v4i8_4(ptr %L, ptr %S) nounwind { ; ; AVX512F-LABEL: shuffle_v32i8_to_v4i8_4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpshufd {{.*#+}} ymm0 = mem[1,1,3,3,5,5,7,7] +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, (%rsi) ; AVX512F-NEXT: vzeroupper @@ -657,14 +658,15 @@ define void @shuffle_v32i8_to_v4i8_4(ptr %L, ptr %S) nounwind { ; ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_4: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = mem[1,1,3,3,5,5,7,7] +; AVX512VL-NEXT: vpsrlq $32, (%rdi), %ymm0 ; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm0 = mem[1,1,3,3,5,5,7,7] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper @@ -672,7 +674,7 @@ define void @shuffle_v32i8_to_v4i8_4(ptr %L, ptr %S) nounwind { ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_4: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpshufd {{.*#+}} ymm0 = mem[1,1,3,3,5,5,7,7] +; AVX512BWVL-NEXT: vpsrlq $32, (%rdi), %ymm0 ; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll index 6938b77..489ee1c 100644 --- 
a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll @@ -86,41 +86,12 @@ define void @shuffle_v16i32_to_v8i32_1(ptr %L, ptr %S) nounwind { } define void @shuffle_v64i8_to_v16i8_1(ptr %L, ptr %S) nounwind { -; AVX512F-LABEL: shuffle_v64i8_to_v16i8_1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovdb %zmm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v64i8_to_v16i8_1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v64i8_to_v16i8_1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsrlw $8, (%rdi), %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8_1: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %zmm0 -; AVX512BWVL-NEXT: vpmovdb %zmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v64i8_to_v16i8_1: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $8, (%rdi), %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %vec = load <64 x i8>, ptr %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> store <16 x i8> %strided.vec, ptr %S @@ -128,37 +99,12 @@ define void @shuffle_v64i8_to_v16i8_1(ptr %L, ptr %S) nounwind { } define void @shuffle_v64i8_to_v16i8_2(ptr %L, ptr %S) nounwind { -; AVX512F-LABEL: shuffle_v64i8_to_v16i8_2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, 32(%rdi), %zmm0, %zmm0 -; AVX512F-NEXT: vpsrld $16, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovdb %zmm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v64i8_to_v16i8_2: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vinserti64x4 $1, 32(%rdi), %zmm0, %zmm0 -; AVX512VL-NEXT: vpsrld $16, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v64i8_to_v16i8_2: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsrld $16, (%rdi), %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8_2: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %zmm0 -; AVX512BWVL-NEXT: vpmovdb %zmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v64i8_to_v16i8_2: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $16, (%rdi), %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %vec = load <64 x i8>, ptr %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> store <16 x i8> %strided.vec, ptr %S @@ -181,7 +127,7 @@ define void @shuffle_v64i8_to_v16i8_3(ptr %L, ptr %S) nounwind { define void @shuffle_v32i16_to_v8i16_1(ptr %L, ptr %S) nounwind { ; AVX512-LABEL: shuffle_v32i16_to_v8i16_1: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsrld $16, (%rdi), %zmm0 +; AVX512-NEXT: 
vpsrlq $16, (%rdi), %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -194,7 +140,7 @@ define void @shuffle_v32i16_to_v8i16_1(ptr %L, ptr %S) nounwind { define void @shuffle_v32i16_to_v8i16_2(ptr %L, ptr %S) nounwind { ; AVX512-LABEL: shuffle_v32i16_to_v8i16_2: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512-NEXT: vpsrlq $32, (%rdi), %zmm0 ; AVX512-NEXT: vpmovqw %zmm0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -218,43 +164,12 @@ define void @shuffle_v32i16_to_v8i16_3(ptr %L, ptr %S) nounwind { } define void @shuffle_v64i8_to_v8i8_1(ptr %L, ptr %S) nounwind { -; AVX512F-LABEL: shuffle_v64i8_to_v8i8_1: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_1: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsrlw $8, (%rdi), %zmm0 -; AVX512BW-NEXT: vpmovqb %zmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_1: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %zmm0 -; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v64i8_to_v8i8_1: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlq $8, (%rdi), %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %vec = load <64 x i8>, ptr %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> store <8 x i8> %strided.vec, ptr %S @@ -262,39 +177,12 @@ define void @shuffle_v64i8_to_v8i8_1(ptr %L, ptr %S) nounwind { } define void @shuffle_v64i8_to_v8i8_2(ptr %L, ptr %S) nounwind { -; AVX512F-LABEL: shuffle_v64i8_to_v8i8_2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, 32(%rdi), %zmm0, %zmm0 -; AVX512F-NEXT: vpsrld $16, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_2: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vinserti64x4 $1, 32(%rdi), %zmm0, %zmm0 -; AVX512VL-NEXT: vpsrld $16, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_2: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsrld $16, (%rdi), %zmm0 -; AVX512BW-NEXT: vpmovqb %zmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_2: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %zmm0 -; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v64i8_to_v8i8_2: +; AVX512: # 
%bb.0: +; AVX512-NEXT: vpsrlq $16, (%rdi), %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %vec = load <64 x i8>, ptr %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> store <8 x i8> %strided.vec, ptr %S @@ -302,35 +190,12 @@ define void @shuffle_v64i8_to_v8i8_2(ptr %L, ptr %S) nounwind { } define void @shuffle_v64i8_to_v8i8_3(ptr %L, ptr %S) nounwind { -; AVX512F-LABEL: shuffle_v64i8_to_v8i8_3: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrld $24, (%rdi), %zmm0 -; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_3: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrld $24, (%rdi), %zmm0 -; AVX512VL-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_3: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsrld $24, (%rdi), %zmm0 -; AVX512BW-NEXT: vpmovqb %zmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_3: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrld $24, (%rdi), %zmm0 -; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v64i8_to_v8i8_3: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlq $24, (%rdi), %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %vec = load <64 x i8>, ptr %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> store <8 x i8> %strided.vec, ptr %S @@ -340,7 +205,7 @@ define void @shuffle_v64i8_to_v8i8_3(ptr %L, ptr %S) nounwind { define void @shuffle_v64i8_to_v8i8_4(ptr %L, ptr %S) nounwind { ; AVX512-LABEL: shuffle_v64i8_to_v8i8_4: ; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512-NEXT: vpsrlq $32, (%rdi), %zmm0 ; AVX512-NEXT: vpmovqb %zmm0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -351,35 +216,12 @@ define void @shuffle_v64i8_to_v8i8_4(ptr %L, ptr %S) nounwind { } define void @shuffle_v64i8_to_v8i8_5(ptr %L, ptr %S) nounwind { -; AVX512F-LABEL: shuffle_v64i8_to_v8i8_5: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrlq $40, (%rdi), %zmm0 -; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_5: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlq $40, (%rdi), %zmm0 -; AVX512VL-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_5: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsrlq $40, (%rdi), %zmm0 -; AVX512BW-NEXT: vpmovqb %zmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_5: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlq $40, (%rdi), %zmm0 -; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v64i8_to_v8i8_5: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlq $40, (%rdi), %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %vec = load <64 x i8>, ptr %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> store <8 x i8> %strided.vec, ptr %S @@ -387,39 +229,12 @@ define void @shuffle_v64i8_to_v8i8_5(ptr %L, ptr %S) nounwind { } define void 
@shuffle_v64i8_to_v8i8_6(ptr %L, ptr %S) nounwind { -; AVX512F-LABEL: shuffle_v64i8_to_v8i8_6: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, 32(%rdi), %zmm0, %zmm0 -; AVX512F-NEXT: vpsrlq $48, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_6: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vinserti64x4 $1, 32(%rdi), %zmm0, %zmm0 -; AVX512VL-NEXT: vpsrlq $48, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_6: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsrlq $48, (%rdi), %zmm0 -; AVX512BW-NEXT: vpmovqb %zmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_6: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlq $48, (%rdi), %zmm0 -; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v64i8_to_v8i8_6: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlq $48, (%rdi), %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %vec = load <64 x i8>, ptr %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> store <8 x i8> %strided.vec, ptr %S @@ -427,38 +242,17 @@ define void @shuffle_v64i8_to_v8i8_6(ptr %L, ptr %S) nounwind { } define void @shuffle_v64i8_to_v8i8_7(ptr %L, ptr %S) nounwind { -; AVX512F-LABEL: shuffle_v64i8_to_v8i8_7: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrlq $56, (%rdi), %zmm0 -; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_7: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlq $56, (%rdi), %zmm0 -; AVX512VL-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_7: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsrlq $56, (%rdi), %zmm0 -; AVX512BW-NEXT: vpmovqb %zmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_7: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlq $56, (%rdi), %zmm0 -; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v64i8_to_v8i8_7: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlq $56, (%rdi), %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %vec = load <64 x i8>, ptr %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> store <8 x i8> %strided.vec, ptr %S ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; AVX512VL: {{.*}} diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll index bbf9db6..9a1d3ad 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -274,53 +274,12 @@ define void @trunc_v8i64_to_v8i8(ptr %L, ptr %S) nounwind { } define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61(<64 x i8> %x) { -; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm1 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm1 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61: -; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VBMI-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VBMI-NEXT: vzeroupper -; AVX512VBMI-NEXT: retq -; -; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VBMIVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq +; AVX512-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $8, %zmm0, %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> ret <16 x i8> %res } @@ -333,7 +292,7 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_ ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrld $8, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512F-NEXT: vzeroupper @@ -346,7 +305,7 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_ ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u] ; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrld $8, %ymm0, 
%ymm0 ; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512VL-NEXT: vzeroupper @@ -359,7 +318,7 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_ ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512BW-NEXT: vpsrld $8, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512BW-NEXT: vzeroupper @@ -372,7 +331,7 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_ ; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u] ; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] ; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512BWVL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpsrld $8, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512BWVL-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll index dcaac8d..2c8d657 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll @@ -251,8 +251,8 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-LABEL: load_i16_stride4_vf4: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vpsrld $16, %ymm0, %ymm1 -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[1,1,3,3,5,5,7,7] +; AVX512-NEXT: vpsrlq $16, %ymm0, %ymm1 +; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm2 ; AVX512-NEXT: vpsrlq $48, %ymm0, %ymm3 ; AVX512-NEXT: vpmovqw %ymm0, (%rsi) ; AVX512-NEXT: vpmovqw %ymm1, (%rdx) @@ -516,31 +516,18 @@ define void @load_i16_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; -; AVX512F-LABEL: load_i16_stride4_vf8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrld $16, (%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512F-NEXT: vpshufd {{.*#+}} zmm2 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] -; AVX512F-NEXT: vpsrlq $48, (%rdi), %zmm3 -; AVX512F-NEXT: vpmovqw %zmm1, (%rsi) -; AVX512F-NEXT: vpmovqw %zmm0, (%rdx) -; AVX512F-NEXT: vpmovqw %zmm2, (%rcx) -; AVX512F-NEXT: vpmovqw %zmm3, (%r8) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: load_i16_stride4_vf8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] -; AVX512BW-NEXT: vpsrlq $48, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovqw %zmm0, (%rsi) -; AVX512BW-NEXT: vpmovqw %zmm1, (%rdx) -; AVX512BW-NEXT: vpmovqw %zmm2, (%rcx) -; AVX512BW-NEXT: vpmovqw %zmm3, (%r8) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512-LABEL: load_i16_stride4_vf8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm1 +; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm2 +; AVX512-NEXT: vpsrlq $48, %zmm0, %zmm3 +; AVX512-NEXT: vpmovqw %zmm0, (%rsi) +; AVX512-NEXT: vpmovqw %zmm1, (%rdx) +; AVX512-NEXT: vpmovqw %zmm2, (%rcx) +; AVX512-NEXT: vpmovqw 
%zmm3, (%r8) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %wide.vec = load <32 x i16>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <8 x i32> %strided.vec1 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <8 x i32> @@ -1059,7 +1046,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vpsrld $16, %zmm3, %zmm5 +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm3, %zmm5 ; AVX512F-SLOW-NEXT: vpmovqw %zmm5, %xmm5 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] @@ -1075,7 +1062,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} zmm8 = zmm3[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm3, %zmm8 ; AVX512F-SLOW-NEXT: vpmovqw %zmm8, %xmm8 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] @@ -1118,7 +1105,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpsrld $16, %zmm4, %zmm3 +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm4, %zmm3 ; AVX512F-FAST-NEXT: vpmovqw %zmm3, %xmm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7] @@ -1127,7 +1114,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm3, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm5 ; AVX512F-FAST-NEXT: vpermt2d %ymm0, %ymm8, %ymm5 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} zmm0 = zmm4[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm4, %zmm0 ; AVX512F-FAST-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm2 @@ -2321,7 +2308,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpsrld $16, %zmm1, %zmm9 +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm1, %zmm9 ; AVX512F-SLOW-NEXT: vpmovqw %zmm9, %xmm9 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 @@ -2336,7 +2323,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = 
ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-SLOW-NEXT: vpsrld $16, %zmm0, %zmm9 +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm0, %zmm9 ; AVX512F-SLOW-NEXT: vpmovqw %zmm9, %xmm9 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] @@ -2353,7 +2340,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} zmm13 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm1, %zmm13 ; AVX512F-SLOW-NEXT: vpmovqw %zmm13, %xmm13 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 @@ -2370,7 +2357,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} zmm14 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm0, %zmm14 ; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm13[0,1,2,3],zmm12[4,5,6,7] @@ -2440,14 +2427,14 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm9 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-FAST-NEXT: vpsrld $16, %zmm1, %zmm13 +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm1, %zmm13 ; AVX512F-FAST-NEXT: vpmovqw %zmm13, %xmm13 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 ; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm12 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm13 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-FAST-NEXT: vpsrld $16, %zmm0, %zmm13 +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm0, %zmm13 ; AVX512F-FAST-NEXT: vpmovqw %zmm13, %xmm13 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm12[0,1,2,3],zmm9[4,5,6,7] @@ -2457,7 +2444,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm12, %ymm8 ; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm15 ; AVX512F-FAST-NEXT: vpermt2d %ymm13, %ymm11, %ymm15 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} zmm13 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm1, %zmm13 ; AVX512F-FAST-NEXT: vpmovqw %zmm13, %xmm13 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm13 @@ -2466,7 +2453,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm12, %ymm12 ; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm3 ; AVX512F-FAST-NEXT: vpermt2d %ymm2, %ymm11, 
%ymm3 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm0, %zmm2 ; AVX512F-FAST-NEXT: vpmovqw %zmm2, %xmm2 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm13[4,5,6,7] @@ -5104,13 +5091,13 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: vinserti128 $1, (%rsp), %ymm0, %ymm13 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-SLOW-NEXT: vpsrld $16, %zmm0, %zmm13 +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm0, %zmm13 ; AVX512F-SLOW-NEXT: vpmovqw %zmm13, %xmm13 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vpsrld $16, %zmm1, %zmm14 +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm1, %zmm14 ; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 @@ -5118,13 +5105,13 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-SLOW-NEXT: vpsrld $16, %zmm2, %zmm13 +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm2, %zmm13 ; AVX512F-SLOW-NEXT: vpmovqw %zmm13, %xmm13 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] -; AVX512F-SLOW-NEXT: vpsrld $16, %zmm21, %zmm14 +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm21, %zmm14 ; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 @@ -5136,7 +5123,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm30, %ymm0, %ymm12 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} zmm14 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm0, %zmm14 ; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12 @@ -5147,7 +5134,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm14 ; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm10 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} 
zmm14 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm1, %zmm14 ; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm10[4,5,6,7] ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7] @@ -5158,7 +5145,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm14 ; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm8 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} zmm14 = zmm2[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm2, %zmm14 ; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 @@ -5169,7 +5156,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm14 ; AVX512F-SLOW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm14[6,7] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} zmm14 = zmm21[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm21, %zmm14 ; AVX512F-SLOW-NEXT: vpmovqw %zmm14, %xmm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7] ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm8[4,5,6,7] @@ -5283,28 +5270,28 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm11, %ymm11 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm14[6,7] -; AVX512F-FAST-NEXT: vpsrld $16, %zmm4, %zmm14 +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm4, %zmm14 ; AVX512F-FAST-NEXT: vpmovqw %zmm14, %xmm14 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm8 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpsrld $16, %zmm30, %zmm8 +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm30, %zmm8 ; AVX512F-FAST-NEXT: vpmovqw %zmm8, %xmm8 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm3[0,1,2,3],zmm11[4,5,6,7] ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm12, %ymm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vpsrld $16, %zmm26, %zmm3 +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm26, %zmm3 ; AVX512F-FAST-NEXT: vpmovqw %zmm3, %xmm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpsrld $16, %zmm23, %zmm3 +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm23, %zmm3 ; AVX512F-FAST-NEXT: vpmovqw %zmm3, %xmm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm19 = 
zmm1[0,1,2,3],zmm0[4,5,6,7] @@ -5314,7 +5301,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpermd %ymm25, %ymm15, %ymm8 ; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm8, %ymm1 ; AVX512F-FAST-NEXT: vpermt2d %ymm0, %ymm7, %ymm1 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} zmm0 = zmm4[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm4, %zmm0 ; AVX512F-FAST-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 @@ -5323,7 +5310,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpermd %ymm28, %ymm15, %ymm12 ; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm14 ; AVX512F-FAST-NEXT: vpermt2d %ymm13, %ymm7, %ymm14 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} zmm13 = zmm30[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm30, %zmm13 ; AVX512F-FAST-NEXT: vpmovqw %zmm13, %xmm13 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm13[0,1,2,3],zmm1[4,5,6,7] @@ -5332,7 +5319,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpermd %ymm17, %ymm15, %ymm14 ; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm14, %ymm11 ; AVX512F-FAST-NEXT: vpermt2d %ymm1, %ymm7, %ymm11 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} zmm1 = zmm26[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm26, %zmm1 ; AVX512F-FAST-NEXT: vpmovqw %zmm1, %xmm1 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm11 @@ -5341,7 +5328,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm15, %ymm5 ; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm6 ; AVX512F-FAST-NEXT: vpermt2d %ymm2, %ymm7, %ymm6 -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} zmm2 = zmm23[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm23, %zmm2 ; AVX512F-FAST-NEXT: vpmovqw %zmm2, %xmm2 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm11[4,5,6,7] @@ -5466,6 +5453,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW: {{.*}} ; AVX512DQBW-FAST: {{.*}} ; AVX512DQBW-SLOW: {{.*}} +; AVX512F: {{.*}} ; AVX512F-ONLY-FAST: {{.*}} ; AVX512F-ONLY-SLOW: {{.*}} ; FALLBACK0: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll index 7cb62c1..042c6d2 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll @@ -245,7 +245,7 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-LABEL: load_i8_stride4_vf8: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vpsrlw $8, %ymm0, %ymm1 +; AVX512-NEXT: vpsrld $8, %ymm0, %ymm1 ; AVX512-NEXT: vpsrld $16, %ymm0, %ymm2 ; AVX512-NEXT: vpsrld $24, %ymm0, %ymm3 ; AVX512-NEXT: vpmovdb %ymm0, (%rsi) @@ -475,35 +475,18 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%r8) ; AVX2-ONLY-NEXT: retq ; -; AVX512F-LABEL: load_i8_stride4_vf16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: 
vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512F-NEXT: vpsrld $16, %zmm1, %zmm2 -; AVX512F-NEXT: vpsrld $24, %zmm1, %zmm3 -; AVX512F-NEXT: vpmovdb %zmm1, (%rsi) -; AVX512F-NEXT: vpmovdb %zmm0, (%rdx) -; AVX512F-NEXT: vpmovdb %zmm2, (%rcx) -; AVX512F-NEXT: vpmovdb %zmm3, (%r8) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: load_i8_stride4_vf16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm1 -; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm2 -; AVX512BW-NEXT: vpsrld $24, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovdb %zmm0, (%rsi) -; AVX512BW-NEXT: vpmovdb %zmm1, (%rdx) -; AVX512BW-NEXT: vpmovdb %zmm2, (%rcx) -; AVX512BW-NEXT: vpmovdb %zmm3, (%r8) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512-LABEL: load_i8_stride4_vf16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpsrld $8, %zmm0, %zmm1 +; AVX512-NEXT: vpsrld $16, %zmm0, %zmm2 +; AVX512-NEXT: vpsrld $24, %zmm0, %zmm3 +; AVX512-NEXT: vpmovdb %zmm0, (%rsi) +; AVX512-NEXT: vpmovdb %zmm1, (%rdx) +; AVX512-NEXT: vpmovdb %zmm2, (%rcx) +; AVX512-NEXT: vpmovdb %zmm3, (%r8) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %wide.vec = load <64 x i8>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <64 x i8> %wide.vec, <64 x i8> poison, <16 x i32> %strided.vec1 = shufflevector <64 x i8> %wide.vec, <64 x i8> poison, <16 x i32> @@ -935,45 +918,41 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512F-LABEL: load_i8_stride4_vf32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm5 -; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [0,4,0,4,0,4,8,12] -; AVX512F-NEXT: vpermt2d %ymm5, %ymm6, %ymm1 -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512F-NEXT: vpmovdb %zmm5, %xmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %ymm7, %ymm4, %ymm8 -; AVX512F-NEXT: vpshufb %ymm7, %ymm0, %ymm7 -; AVX512F-NEXT: vpermt2d %ymm8, %ymm6, %ymm7 -; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm7 -; AVX512F-NEXT: vpshufb %ymm3, %ymm0, %ymm3 -; AVX512F-NEXT: vpermt2d %ymm7, %ymm6, %ymm3 -; AVX512F-NEXT: vpsrld $16, %zmm5, %zmm7 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX512F-NEXT: vpshufb %ymm0, %ymm1, %ymm2 +; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,4,0,4,0,4,8,12] +; AVX512F-NEXT: vpermt2d %ymm2, %ymm4, %ymm0 +; 
AVX512F-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512F-NEXT: vpmovdb %zmm2, %xmm5 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %ymm5, %ymm1, %ymm6 +; AVX512F-NEXT: vpshufb %ymm5, %ymm3, %ymm5 +; AVX512F-NEXT: vpermt2d %ymm6, %ymm4, %ymm5 +; AVX512F-NEXT: vpsrld $8, %zmm2, %zmm6 +; AVX512F-NEXT: vpmovdb %zmm6, %xmm6 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %ymm6, %ymm1, %ymm7 +; AVX512F-NEXT: vpshufb %ymm6, %ymm3, %ymm6 +; AVX512F-NEXT: vpermt2d %ymm7, %ymm4, %ymm6 +; AVX512F-NEXT: vpsrld $16, %zmm2, %zmm7 ; AVX512F-NEXT: vpmovdb %zmm7, %xmm7 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX512F-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512F-NEXT: vpermt2d %ymm4, %ymm6, %ymm0 -; AVX512F-NEXT: vpsrld $24, %zmm5, %zmm4 -; AVX512F-NEXT: vpmovdb %zmm4, %xmm4 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vmovdqa %ymm1, (%rsi) -; AVX512F-NEXT: vmovdqa %ymm2, (%rdx) -; AVX512F-NEXT: vmovdqa %ymm3, (%rcx) -; AVX512F-NEXT: vmovdqa %ymm0, (%r8) +; AVX512F-NEXT: vpshufb %ymm7, %ymm1, %ymm1 +; AVX512F-NEXT: vpshufb %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vpermt2d %ymm1, %ymm4, %ymm3 +; AVX512F-NEXT: vpsrld $24, %zmm2, %zmm1 +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512F-NEXT: vmovdqa %ymm5, (%rdx) +; AVX512F-NEXT: vmovdqa %ymm6, (%rcx) +; AVX512F-NEXT: vmovdqa %ymm1, (%r8) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1937,47 +1916,39 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512F-NEXT: vmovdqa 224(%rdi), %ymm3 -; AVX512F-NEXT: vpshufb %ymm6, %ymm3, %ymm5 +; AVX512F-NEXT: vpshufb %ymm7, %ymm3, %ymm5 ; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512F-NEXT: vpshufb %ymm6, %ymm4, %ymm7 +; AVX512F-NEXT: vpshufb %ymm7, %ymm4, %ymm6 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,0,4,0,4,8,12] -; AVX512F-NEXT: vpermt2d %ymm5, %ymm1, %ymm7 +; AVX512F-NEXT: vpermt2d %ymm5, %ymm1, %ymm6 ; AVX512F-NEXT: vpmovdb %zmm2, %xmm5 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm9 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm10 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX512F-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX512F-NEXT: vpshufb %ymm6, %ymm7, %ymm11 -; AVX512F-NEXT: vpshufb %ymm6, %ymm5, %ymm6 -; AVX512F-NEXT: vpermt2d %ymm11, %ymm1, %ymm6 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm9[4,5,6,7] -; AVX512F-NEXT: 
vmovdqa {{.*#+}} ymm9 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %ymm9, %ymm3, %ymm11 -; AVX512F-NEXT: vpshufb %ymm9, %ymm4, %ymm12 -; AVX512F-NEXT: vpermt2d %ymm11, %ymm1, %ymm12 -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512F-NEXT: vpsrlw $8, %ymm11, %ymm11 -; AVX512F-NEXT: vmovdqa 160(%rdi), %ymm13 -; AVX512F-NEXT: vpsrlw $8, %ymm13, %ymm13 -; AVX512F-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm11 -; AVX512F-NEXT: vpmovdb %zmm11, %xmm11 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11 -; AVX512F-NEXT: vpshufb %ymm9, %ymm7, %ymm12 -; AVX512F-NEXT: vpshufb %ymm9, %ymm5, %ymm9 -; AVX512F-NEXT: vpermt2d %ymm12, %ymm1, %ymm9 -; AVX512F-NEXT: vpsrlw $8, %ymm10, %ymm10 -; AVX512F-NEXT: vpsrlw $8, %ymm8, %ymm8 -; AVX512F-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm8 -; AVX512F-NEXT: vpmovdb %zmm8, %xmm8 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm11[4,5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm8 +; AVX512F-NEXT: vmovdqa 96(%rdi), %ymm5 +; AVX512F-NEXT: vpshufb %ymm7, %ymm5, %ymm9 +; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX512F-NEXT: vpshufb %ymm7, %ymm6, %ymm7 +; AVX512F-NEXT: vpermt2d %ymm9, %ymm1, %ymm7 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm9 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,2,3],zmm8[4,5,6,7] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vpshufb %ymm8, %ymm3, %ymm9 +; AVX512F-NEXT: vpshufb %ymm8, %ymm4, %ymm10 +; AVX512F-NEXT: vpermt2d %ymm9, %ymm1, %ymm10 +; AVX512F-NEXT: vpsrld $8, %zmm2, %zmm9 +; AVX512F-NEXT: vpmovdb %zmm9, %xmm9 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 +; AVX512F-NEXT: vpshufb %ymm8, %ymm5, %ymm10 +; AVX512F-NEXT: vpshufb %ymm8, %ymm6, %ymm8 +; AVX512F-NEXT: vpermt2d %ymm10, %ymm1, %ymm8 +; AVX512F-NEXT: vpsrld $8, %zmm0, %zmm10 +; AVX512F-NEXT: vpmovdb %zmm10, %xmm10 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] +; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm9[4,5,6,7] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512F-NEXT: vpshufb %ymm9, %ymm3, %ymm10 ; AVX512F-NEXT: vpshufb %ymm9, %ymm4, %ymm11 @@ -1986,8 +1957,8 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpmovdb %zmm10, %xmm10 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 -; AVX512F-NEXT: vpshufb %ymm9, %ymm7, %ymm11 -; AVX512F-NEXT: vpshufb %ymm9, %ymm5, %ymm9 +; AVX512F-NEXT: vpshufb %ymm9, %ymm5, %ymm11 +; AVX512F-NEXT: vpshufb %ymm9, %ymm6, %ymm9 ; AVX512F-NEXT: vpermt2d %ymm11, %ymm1, %ymm9 ; AVX512F-NEXT: vpsrld $16, %zmm0, %zmm11 ; AVX512F-NEXT: vpmovdb %zmm11, %xmm11 @@ -2001,14 +1972,14 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 -; AVX512F-NEXT: vpshufb %ymm10, %ymm7, %ymm3 -; AVX512F-NEXT: vpshufb %ymm10, %ymm5, 
%ymm4
+; AVX512F-NEXT: vpshufb %ymm10, %ymm5, %ymm3
+; AVX512F-NEXT: vpshufb %ymm10, %ymm6, %ymm4
 ; AVX512F-NEXT: vpermt2d %ymm3, %ymm1, %ymm4
 ; AVX512F-NEXT: vpsrld $24, %zmm0, %zmm0
 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
-; AVX512F-NEXT: vmovdqa64 %zmm6, (%rsi)
+; AVX512F-NEXT: vmovdqa64 %zmm7, (%rsi)
 ; AVX512F-NEXT: vmovdqa64 %zmm8, (%rdx)
 ; AVX512F-NEXT: vmovdqa64 %zmm9, (%rcx)
 ; AVX512F-NEXT: vmovdqa64 %zmm0, (%r8)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
index 455b77c..abcd453 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
@@ -290,10 +290,10 @@ define void @load_i8_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512-NEXT: vpsrlw $8, %ymm0, %ymm1
-; AVX512-NEXT: vpsrld $16, %ymm0, %ymm2
-; AVX512-NEXT: vpsrld $24, %ymm0, %ymm3
-; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7]
+; AVX512-NEXT: vpsrlq $8, %ymm0, %ymm1
+; AVX512-NEXT: vpsrlq $16, %ymm0, %ymm2
+; AVX512-NEXT: vpsrlq $24, %ymm0, %ymm3
+; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm4
 ; AVX512-NEXT: vpsrlq $40, %ymm0, %ymm5
 ; AVX512-NEXT: vpsrlq $48, %ymm0, %ymm6
 ; AVX512-NEXT: vpsrlq $56, %ymm0, %ymm7
@@ -720,67 +720,29 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rax)
 ; AVX2-ONLY-NEXT: retq
 ;
-; AVX512F-LABEL: load_i8_stride8_vf8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512F-NEXT: vpmovqb %zmm0, %xmm1
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm2
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512F-NEXT: vpmovqb %zmm2, %xmm2
-; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm3
-; AVX512F-NEXT: vpsrld $16, %zmm3, %zmm4
-; AVX512F-NEXT: vpmovqb %zmm4, %xmm4
-; AVX512F-NEXT: vpsrld $24, %zmm3, %zmm3
-; AVX512F-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512F-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
-; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5
-; AVX512F-NEXT: vpsrlq $40, %zmm5, %zmm6
-; AVX512F-NEXT: vpmovqb %zmm6, %xmm6
-; AVX512F-NEXT: vpsrlq $48, %zmm5, %zmm7
-; AVX512F-NEXT: vpmovqb %zmm7, %xmm7
-; AVX512F-NEXT: vpsrlq $56, %zmm5, %zmm5
-; AVX512F-NEXT: vpmovqb %zmm5, %xmm5
-; AVX512F-NEXT: vmovq %xmm1, (%rsi)
-; AVX512F-NEXT: vmovq %xmm2, (%rdx)
-; AVX512F-NEXT: vmovq %xmm4, (%rcx)
-; AVX512F-NEXT: vmovq %xmm3, (%r8)
-; AVX512F-NEXT: vmovq %xmm0, (%r9)
-; AVX512F-NEXT: vmovq %xmm6, (%r11)
-; AVX512F-NEXT: vmovq %xmm7, (%r10)
-; AVX512F-NEXT: vmovq %xmm5, (%rax)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: load_i8_stride8_vf8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm1
-; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm2
-; AVX512BW-NEXT: vpsrld
$24, %zmm0, %zmm3 -; AVX512BW-NEXT: vpshufd {{.*#+}} zmm4 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] -; AVX512BW-NEXT: vpsrlq $40, %zmm0, %zmm5 -; AVX512BW-NEXT: vpsrlq $48, %zmm0, %zmm6 -; AVX512BW-NEXT: vpsrlq $56, %zmm0, %zmm7 -; AVX512BW-NEXT: vpmovqb %zmm0, (%rsi) -; AVX512BW-NEXT: vpmovqb %zmm1, (%rdx) -; AVX512BW-NEXT: vpmovqb %zmm2, (%rcx) -; AVX512BW-NEXT: vpmovqb %zmm3, (%r8) -; AVX512BW-NEXT: vpmovqb %zmm4, (%r9) -; AVX512BW-NEXT: vpmovqb %zmm5, (%r11) -; AVX512BW-NEXT: vpmovqb %zmm6, (%r10) -; AVX512BW-NEXT: vpmovqb %zmm7, (%rax) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512-LABEL: load_i8_stride8_vf8: +; AVX512: # %bb.0: +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpsrlq $8, %zmm0, %zmm1 +; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm2 +; AVX512-NEXT: vpsrlq $24, %zmm0, %zmm3 +; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm4 +; AVX512-NEXT: vpsrlq $40, %zmm0, %zmm5 +; AVX512-NEXT: vpsrlq $48, %zmm0, %zmm6 +; AVX512-NEXT: vpsrlq $56, %zmm0, %zmm7 +; AVX512-NEXT: vpmovqb %zmm0, (%rsi) +; AVX512-NEXT: vpmovqb %zmm1, (%rdx) +; AVX512-NEXT: vpmovqb %zmm2, (%rcx) +; AVX512-NEXT: vpmovqb %zmm3, (%r8) +; AVX512-NEXT: vpmovqb %zmm4, (%r9) +; AVX512-NEXT: vpmovqb %zmm5, (%r11) +; AVX512-NEXT: vpmovqb %zmm6, (%r10) +; AVX512-NEXT: vpmovqb %zmm7, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %wide.vec = load <64 x i8>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <64 x i8> %wide.vec, <64 x i8> poison, <8 x i32> %strided.vec1 = shufflevector <64 x i8> %wide.vec, <64 x i8> poison, <8 x i32> @@ -5289,18 +5251,19 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm12 ; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm2 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm6 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm15 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] @@ -5312,239 +5275,239 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm8 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: 
vpshufb %xmm4, %xmm1, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm19 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm6, %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm19 +; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm20 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm10[3] ; AVX512F-SLOW-NEXT: vpmovqb %zmm16, %xmm10 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm20 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm21 -; AVX512F-SLOW-NEXT: vmovdqa %xmm14, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7] -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm13 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm14 -; AVX512F-SLOW-NEXT: vpsrlw $8, %ymm13, %ymm13 -; AVX512F-SLOW-NEXT: vpsrlw $8, %ymm14, %ymm14 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm17 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm11, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm1, %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm22 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-SLOW-NEXT: vmovdqa %xmm12, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm12 +; 
AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm15, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa %xmm15, %xmm10 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm24 +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm21 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm6 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa %xmm7, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm13 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm12 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX512F-SLOW-NEXT: vpmovqb %zmm17, %xmm2 +; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm16, %zmm2 +; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm18 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm7 -; 
AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm12 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm20 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm21 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm23 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm14, %xmm15 -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm13, %xmm12 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3,4],ymm5[5],ymm12[6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm5[5],ymm14[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm22 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm24 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm20 +; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm8, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm22 +; AVX512F-SLOW-NEXT: vmovdqa %xmm11, %xmm6 +; 
AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrld $16, %zmm16, %zmm2 +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm16, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm19 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512F-SLOW-NEXT: vmovdqa %xmm7, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm25 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm21 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm25 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm7 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm12 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm23 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm11 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm14, %xmm15 -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm13, %xmm12 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm26 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3,4],ymm5[5],ymm12[6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm5[5],ymm14[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; 
AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm21 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm24 +; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm8 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrld $24, %zmm16, %zmm2 +; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm16, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm20 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm25 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm12 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm14 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm12 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm23 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm14, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa %xmm14, %xmm15 -; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm13, %xmm12 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm26 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm0, %xmm5 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = 
<4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm28 +; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm24 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm12 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm23 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm10 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm8, %xmm14 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} zmm2 = zmm16[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm16, %zmm2 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm4, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm6, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm14 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm13 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm24 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm25 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm15 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm27 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 ; 
AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm8 ; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm5 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm11, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm12, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm5, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm7, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm23 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm16, %zmm2 @@ -5552,39 +5515,43 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm23 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm27 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm13, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa %xmm13, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm13 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm25 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 ; 
AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm13, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm15 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm11 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm24 +; AVX512F-SLOW-NEXT: vpshufb %xmm9, %xmm7, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm16, %zmm2 @@ -5592,43 +5559,44 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, 
%xmm13, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm5 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm16, %zmm3 ; AVX512F-SLOW-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, (%rsi) -; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, (%rdx) +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm17, (%rsi) +; AVX512F-SLOW-NEXT: vmovdqa64 %ymm18, (%rdx) ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm19, (%rcx) ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm20, (%r8) ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm21, (%r9) @@ -5644,226 +5612,227 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-LABEL: load_i8_stride8_vf32: ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm18 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] ; AVX512F-FAST-NEXT: vmovdqa64 224(%rdi), %ymm16 ; AVX512F-FAST-NEXT: vpermd %ymm16, %ymm0, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512F-FAST-NEXT: vpermd %ymm17, %ymm0, %ymm10 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] +; AVX512F-FAST-NEXT: vpermd %ymm17, %ymm0, %ymm6 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vmovdqa64 160(%rdi), %ymm19 -; AVX512F-FAST-NEXT: vpermd %ymm19, %ymm0, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %ymm13 +; AVX512F-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm9 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vmovdqa64 128(%rdi), %ymm20 -; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm0, %ymm11 +; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm15 +; AVX512F-FAST-NEXT: vpermd %ymm15, %ymm0, %ymm11 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512F-FAST-NEXT: vmovdqa 112(%rdi), %xmm12 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm3 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm8 +; AVX512F-FAST-NEXT: vmovdqa 112(%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm12 +; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm12, %xmm10 ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm7 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX512F-FAST-NEXT: vpmovqb %zmm18, %xmm7 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512F-FAST-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpsrlw $8, %ymm6, %ymm6 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm23 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm8 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm7[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm25 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5],ymm7[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm12, %xmm14 -; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm13 -; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm14 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm13[0,1,2],xmm8[3] -; AVX512F-FAST-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm8 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] +; 
AVX512F-FAST-NEXT: vpmovqb %zmm18, %xmm8 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm7 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm27 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm8 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm28 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm8 +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm7 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm12, %xmm10 +; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm8 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] +; AVX512F-FAST-NEXT: vpsrlq $8, %zmm18, %zmm8 +; AVX512F-FAST-NEXT: vpmovqb %zmm8, %xmm8 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm29 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm30 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm28 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm29 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm31 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm8 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm8 -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm7 +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm8 +; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm3 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = 
xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm13 +; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm12, %xmm10 ; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] -; AVX512F-FAST-NEXT: vpsrld $16, %zmm18, %zmm8 +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm18, %zmm8 ; AVX512F-FAST-NEXT: vpmovqb %zmm8, %xmm8 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm25 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm10, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm27 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm6, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm9, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm26 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5],ymm7[6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm23 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm11, %ymm6 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm12, %xmm7 -; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm9 -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm7 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] -; AVX512F-FAST-NEXT: vpsrld $24, %zmm18, %zmm7 -; AVX512F-FAST-NEXT: vpmovqb %zmm7, %xmm7 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm6 +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm9 +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm5, %xmm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] 
+; AVX512F-FAST-NEXT: vpsrlq $24, %zmm18, %zmm6 +; AVX512F-FAST-NEXT: vpmovqb %zmm6, %xmm6 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm19 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] ; AVX512F-FAST-NEXT: vpermd %ymm16, %ymm0, %ymm11 -; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm11, %ymm7 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm4 ; AVX512F-FAST-NEXT: vpermd %ymm17, %ymm0, %ymm9 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3,4,5,6],ymm7[7] -; AVX512F-FAST-NEXT: vpermd %ymm19, %ymm0, %ymm13 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpermd %ymm20, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5],ymm10[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm12, %xmm15 -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm10 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm13 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpermd %ymm15, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512F-FAST-NEXT: vmovdqa %xmm2, %xmm6 +; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm2, %xmm15 +; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm14 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm3, %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm12, %xmm0 ; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm15 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} zmm10 = zmm18[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] -; AVX512F-FAST-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm6 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm13, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa64 
%ymm24, %ymm7 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm7 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm7 -; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm10 -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm7 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX512F-FAST-NEXT: vpsrlq $40, %zmm18, %zmm7 -; AVX512F-FAST-NEXT: vpmovqb %zmm7, %xmm7 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm14[3] +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm18, %zmm14 +; AVX512F-FAST-NEXT: vpmovqb %zmm14, %xmm14 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29] +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm14 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa %xmm3, %xmm7 +; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm14 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3] +; AVX512F-FAST-NEXT: vpsrlq $40, %zmm18, %zmm4 +; AVX512F-FAST-NEXT: vpmovqb %zmm4, %xmm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 ; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm9, %ymm7 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm7 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm13, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm10 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm10 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4],ymm7[5],ymm10[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} 
xmm7 = -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm10 -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm7 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm3, %xmm14 -; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm10 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm10[0,1,2],xmm7[3] -; AVX512F-FAST-NEXT: vpsrlq $48, %zmm18, %zmm10 -; AVX512F-FAST-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,23,27,31] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm10 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm9 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5,6],ymm7[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm9 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm13, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm10 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm9[5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm12, %xmm9 -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm14 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm14 +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm12, %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm14 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3] +; AVX512F-FAST-NEXT: vpsrlq $48, %zmm18, %zmm2 +; AVX512F-FAST-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm3 +; AVX512F-FAST-NEXT: vpblendd 
{{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm6, %xmm3 +; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm8 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] ; AVX512F-FAST-NEXT: vpsrlq $56, %zmm18, %zmm3 ; AVX512F-FAST-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, (%rsi) -; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, (%rdx) -; AVX512F-FAST-NEXT: vmovdqa %ymm8, (%rcx) -; AVX512F-FAST-NEXT: vmovdqa %ymm4, (%r8) +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, (%rsi) +; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, (%rdx) +; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, (%rcx) +; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, (%r8) ; AVX512F-FAST-NEXT: vmovdqa %ymm15, (%r9) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa %ymm6, (%rax) +; AVX512F-FAST-NEXT: vmovdqa %ymm10, (%rax) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rax) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax @@ -5949,7 +5918,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm14 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrlw $8, %zmm16, %zmm10 +; AVX512BW-SLOW-NEXT: vpsrlq $8, %zmm16, %zmm10 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm10, %xmm10 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] @@ -5984,7 +5953,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm10 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrld $16, %zmm16, %zmm10 +; AVX512BW-SLOW-NEXT: vpsrlq $16, %zmm16, %zmm10 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm10, %xmm10 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] @@ -6019,7 +5988,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm10 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrld $24, %zmm16, %zmm10 +; AVX512BW-SLOW-NEXT: vpsrlq $24, %zmm16, 
%zmm10 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm10, %xmm10 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] @@ -6054,7 +6023,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpshufb %xmm10, %xmm8, %xmm10 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} zmm10 = zmm16[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512BW-SLOW-NEXT: vpsrlq $32, %zmm16, %zmm10 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm10, %xmm10 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] @@ -6236,7 +6205,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm8 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm20[0],xmm8[1],xmm20[1],xmm8[2],xmm20[2],xmm8[3],xmm20[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3] -; AVX512BW-FAST-NEXT: vpsrlw $8, %zmm0, %zmm8 +; AVX512BW-FAST-NEXT: vpsrlq $8, %zmm0, %zmm8 ; AVX512BW-FAST-NEXT: vpmovqb %zmm8, %xmm8 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -6261,7 +6230,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm10 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm24[0],xmm10[1],xmm24[1],xmm10[2],xmm24[2],xmm10[3],xmm24[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3] -; AVX512BW-FAST-NEXT: vpsrld $16, %zmm0, %zmm10 +; AVX512BW-FAST-NEXT: vpsrlq $16, %zmm0, %zmm10 ; AVX512BW-FAST-NEXT: vpmovqb %zmm10, %xmm10 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] @@ -6285,7 +6254,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm5, %xmm10 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm28[0],xmm10[1],xmm28[1],xmm10[2],xmm28[2],xmm10[3],xmm28[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3] -; AVX512BW-FAST-NEXT: vpsrld $24, %zmm0, %zmm10 +; AVX512BW-FAST-NEXT: vpsrlq $24, %zmm0, %zmm10 ; AVX512BW-FAST-NEXT: vpmovqb %zmm10, %xmm10 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] @@ -6310,7 +6279,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm5, %xmm13 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3] -; AVX512BW-FAST-NEXT: vpshufd {{.*#+}} zmm13 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512BW-FAST-NEXT: vpsrlq $32, %zmm0, %zmm13 ; AVX512BW-FAST-NEXT: vpmovqb %zmm13, %xmm13 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] @@ -11616,27 +11585,23 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; 
AVX512F-SLOW-LABEL: load_i8_stride8_vf64: ; AVX512F-SLOW: # %bb.0: -; AVX512F-SLOW-NEXT: subq $552, %rsp # imm = 0x228 -; AVX512F-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: subq $456, %rsp # imm = 0x1C8 +; AVX512F-SLOW-NEXT: vmovdqa64 256(%rdi), %zmm27 ; AVX512F-SLOW-NEXT: vmovdqa64 384(%rdi), %zmm0 ; AVX512F-SLOW-NEXT: vpmovqb %zmm0, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa 496(%rdi), %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa 496(%rdi), %xmm15 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm3, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa 480(%rdi), %xmm12 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa 480(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm9 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa 464(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa 464(%rdi), %xmm13 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa 448(%rdi), %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm5 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] @@ -11645,461 +11610,444 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpmovqb %ymm4, %xmm4 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vmovdqa 368(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa 336(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm21 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm31 +; AVX512F-SLOW-NEXT: vmovdqa 352(%rdi), %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa 336(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 +; 
AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm19 +; AVX512F-SLOW-NEXT: vmovdqa 320(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] -; AVX512F-SLOW-NEXT: vpmovqb %zmm8, %xmm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] +; AVX512F-SLOW-NEXT: vpmovqb %zmm27, %xmm4 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 ; AVX512F-SLOW-NEXT: movb $-64, %al ; AVX512F-SLOW-NEXT: kmovw %eax, %k1 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm2 {%k1} +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa 240(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa %xmm4, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm22 ; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa 224(%rdi), %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm24 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa 208(%rdi), %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa 256(%rdi), %ymm4 -; AVX512F-SLOW-NEXT: vpsrlw $8, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa 288(%rdi), %ymm5 -; AVX512F-SLOW-NEXT: vpsrlw $8, %ymm5, %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm20 -; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512F-SLOW-NEXT: vpsrlw $8, %ymm4, %ymm4 -; AVX512F-SLOW-NEXT: vpsrlw $8, %ymm5, %ymm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm19 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm16 ; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %xmm6 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm17 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm16 +; AVX512F-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512F-SLOW-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512F-SLOW-NEXT: vpmovqb %zmm5, %xmm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512F-SLOW-NEXT: vpmovqb %ymm5, %xmm5 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] ; AVX512F-SLOW-NEXT: vmovdqa 112(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm20 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX512F-SLOW-NEXT: vmovdqa 80(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm22 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm21 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm23 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm30 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm4 +; AVX512F-SLOW-NEXT: vpmovqb %zmm4, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm4, %zmm18 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512F-SLOW-NEXT: vmovdqa %xmm7, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm25 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm15, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm26 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm13, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm14, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vmovdqa 416(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 432(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm4, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm27 -; AVX512F-SLOW-NEXT: vpshufb 
%xmm11, %xmm3, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm31 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 400(%rdi), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm30 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm28 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqa %xmm15, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] -; AVX512F-SLOW-NEXT: vpmovqb %zmm20, %xmm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm14 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm14 {%k1} -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm9 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm15 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm18 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm7, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm24 -; AVX512F-SLOW-NEXT: vpshufb %xmm11, %xmm3, %xmm11 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm20 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa 432(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm25 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm23 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 400(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm28 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm10 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5],ymm7[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm9 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm7 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm10 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] +; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm27, %zmm7 +; AVX512F-SLOW-NEXT: vpmovqb %zmm7, %xmm7 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm19 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm19 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm12, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm22 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm12 +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm12, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm7 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7] +; AVX512F-SLOW-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, 
%xmm17 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] +; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm16 +; AVX512F-SLOW-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpmovqb %zmm19, %xmm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm14, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm26 -; AVX512F-SLOW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm17 -; AVX512F-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm7, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512F-SLOW-NEXT: 
vmovdqa64 %xmm27, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4],ymm4[5],ymm10[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm22 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm10 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1,2],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpsrld $16, %zmm16, %zmm10 -; AVX512F-SLOW-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm21 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm19 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm15 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm10, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm23 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpsrld $16, %zmm24, %zmm2 +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm10, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX512F-SLOW-NEXT: vpsrlq $8, %zmm18, %zmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm18, %zmm21 ; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm15, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm24 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm2 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm17 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-SLOW-NEXT: vmovdqa %xmm13, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm6 +; 
AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm19 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm11 -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4],ymm4[5],ymm10[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1,2],xmm4[3] -; AVX512F-SLOW-NEXT: vpsrld $24, %zmm16, %zmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512F-SLOW-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm4 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm16 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm26 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm19 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm22 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; AVX512F-SLOW-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm18 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm15, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm27, %zmm8 +; AVX512F-SLOW-NEXT: vpmovqb %zmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm20 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm3 +; 
AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm13 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm18 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm7 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm10 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm12 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] +; AVX512F-SLOW-NEXT: vpsrlq $16, %zmm21, %zmm2 +; AVX512F-SLOW-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 +; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm12 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm29 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; 
AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm14, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm22 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] +; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm27, %zmm8 +; AVX512F-SLOW-NEXT: vpmovqb %zmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm6 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX512F-SLOW-NEXT: vpsrld $24, %zmm24, %zmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, %zmm29 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX512F-SLOW-NEXT: vpsrlq $24, %zmm21, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5],ymm10[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm24 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm24 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm7 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm16 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm27, %zmm8 +; AVX512F-SLOW-NEXT: vpmovqb %zmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm6 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} zmm10 = 
zmm20[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] -; AVX512F-SLOW-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm5 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm25 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm21 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm15, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm23 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm1 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm17 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm14 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} zmm1 = zmm29[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm21, %zmm1 +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -12108,7 +12056,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 @@ -12117,82 +12065,79 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm27 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm25 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm10 -; AVX512F-SLOW-NEXT: 
vmovdqa64 %xmm28, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5],ymm10[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3] -; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm20, %zmm10 -; AVX512F-SLOW-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm5 {%k1} -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm31 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm30 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm26 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] +; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm27, %zmm8 +; AVX512F-SLOW-NEXT: vpmovqb %zmm8, %xmm8 +; AVX512F-SLOW-NEXT: 
vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm6 {%k1} +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm4 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm28 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm30 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm15 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm31 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm21 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm19 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm12 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm11 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; AVX512F-SLOW-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm15, %xmm18 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm14, %xmm20 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 
16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm29, %zmm1 +; AVX512F-SLOW-NEXT: vpsrlq $40, %zmm21, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm23 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm23 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 @@ -12200,8 +12145,9 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm21 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -12209,80 +12155,78 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm28 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm16 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm26 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5],ymm10[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm5 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm16 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm13 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, 
%xmm8 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm9 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm14 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm20, %zmm10 -; AVX512F-SLOW-NEXT: vpmovqb %zmm10, %xmm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm5 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm4 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm15, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm18 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm9, %xmm17 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm27, %zmm8 +; AVX512F-SLOW-NEXT: vpmovqb %zmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm6 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm13 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm10 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm10 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm14 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm22 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm9 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm15 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 
= xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm14 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm25 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm8, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm8, %xmm19 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm19 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm24 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm7, %xmm29 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm8 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm20, %xmm10 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm17 -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm29, %zmm1 +; AVX512F-SLOW-NEXT: vpsrlq $48, %zmm22, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm24 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm20 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-SLOW-NEXT: 
vpshufb %xmm0, %xmm1, %xmm1 @@ -12290,7 +12234,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm4 @@ -12299,72 +12243,73 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm3 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm7 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm13, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm7 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] -; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm20, %zmm6 -; AVX512F-SLOW-NEXT: vpmovqb %zmm6, %xmm6 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm5 -; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm5 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm4 +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; 
AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] +; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm27, %zmm8 +; AVX512F-SLOW-NEXT: vpmovqb %zmm8, %xmm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] +; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6 +; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm6 {%k1} +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm13, %xmm6 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm7 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm8 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm8 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm9 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm14, %xmm2 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm6 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm7 -; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,6],ymm4[7] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm19, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm8 +; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm3 +; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: 
vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm10, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm0 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm14, %xmm3 +; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm29, %zmm1 +; AVX512F-SLOW-NEXT: vpsrlq $56, %zmm22, %zmm1 ; AVX512F-SLOW-NEXT: vpmovqb %zmm1, %xmm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0 +; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0 ; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-SLOW-NEXT: vmovaps %zmm1, (%rsi) ; AVX512F-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -12378,304 +12323,305 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-SLOW-NEXT: vmovdqa64 %zmm24, (%rax) +; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, (%rax) ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-SLOW-NEXT: addq $552, %rsp # imm = 0x228 +; AVX512F-SLOW-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX512F-SLOW-NEXT: vzeroupper ; AVX512F-SLOW-NEXT: retq ; ; AVX512F-FAST-LABEL: load_i8_stride8_vf64: ; AVX512F-FAST: # %bb.0: -; AVX512F-FAST-NEXT: subq $440, %rsp # imm = 0x1B8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,2,3,0,2,4,6] +; AVX512F-FAST-NEXT: subq $408, %rsp # imm = 0x198 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6] ; AVX512F-FAST-NEXT: vmovdqa 480(%rdi), %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm5 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa %ymm1, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-FAST-NEXT: vmovdqa 448(%rdi), %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm10 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-FAST-NEXT: vmovdqa 416(%rdi), %ymm2 +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-FAST-NEXT: vmovdqa 448(%rdi), %ymm2 ; AVX512F-FAST-NEXT: vmovdqu %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm16 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX512F-FAST-NEXT: vmovdqa 384(%rdi), %ymm3 +; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm25 +; AVX512F-FAST-NEXT: vmovdqa %ymm3, %ymm10 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vmovdqa 416(%rdi), %ymm3 ; AVX512F-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm13 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm13, %ymm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX512F-FAST-NEXT: vmovdqa 368(%rdi), %xmm4 -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa 352(%rdi), %xmm14 -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm14, %xmm3 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX512F-FAST-NEXT: vmovdqa 336(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm30 -; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %xmm1 -; AVX512F-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm1, %xmm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpmovqb %zmm1, %xmm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm17 +; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm24 +; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm12 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vmovdqa64 384(%rdi), %ymm31 +; AVX512F-FAST-NEXT: vpermd %ymm31, %ymm1, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm23 +; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm15 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-FAST-NEXT: vmovdqa 368(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512F-FAST-NEXT: vmovdqa 352(%rdi), %xmm13 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX512F-FAST-NEXT: vmovdqa 336(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm27 +; AVX512F-FAST-NEXT: vmovdqa 320(%rdi), %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm8 +; 
AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] +; AVX512F-FAST-NEXT: vmovdqa64 256(%rdi), %zmm26 +; AVX512F-FAST-NEXT: vpmovqb %zmm26, %xmm7 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm16 ; AVX512F-FAST-NEXT: movb $-64, %al ; AVX512F-FAST-NEXT: kmovw %eax, %k1 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm17 {%k1} -; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512F-FAST-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 288(%rdi), %ymm2 -; AVX512F-FAST-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm23 -; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-FAST-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm24 -; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %ymm29 -; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm8, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm25 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-FAST-NEXT: vmovdqa64 160(%rdi), %ymm28 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 {%k1} +; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm22 +; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm2 ; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vpermd %ymm28, %ymm8, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm10 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm21 -; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm8 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm26 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5],ymm8[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm14 +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm14, %ymm7 +; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm9 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5],ymm9[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6,7] ; AVX512F-FAST-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm8 -; 
AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm27 -; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm10 -; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm7 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm9 -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm3, %xmm15 -; AVX512F-FAST-NEXT: vmovdqa %xmm3, %xmm8 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm31 -; AVX512F-FAST-NEXT: vpmovqb %zmm31, %xmm1 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm7 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm17 +; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm29 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; AVX512F-FAST-NEXT: vmovdqa 80(%rdi), %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm0 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm18 +; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %xmm15 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm15, %xmm5 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] +; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512F-FAST-NEXT: vpmovqb %zmm2, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm16 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm10 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm30 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm7 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm18 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm11 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm24 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm8 +; AVX512F-FAST-NEXT: vpshufb 
%ymm2, %ymm8, %ymm3 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm15 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm14, %xmm16 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm6 -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm6, %xmm0 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm11, %xmm14 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3] -; AVX512F-FAST-NEXT: vpmovqb %zmm23, %xmm2 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm12 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm3 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm13, %xmm21 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm13 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm13, %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm6 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm23 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] +; AVX512F-FAST-NEXT: vpsrlq $8, %zmm26, %zmm3 +; AVX512F-FAST-NEXT: vpmovqb %zmm3, %xmm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm3 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm14 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm14, %ymm3 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm9 +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm3 +; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm3 ; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm14, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm11 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm11, %ymm14 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4],ymm3[5],ymm14[6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm14, %ymm20 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm7 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm6 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; 
AVX512F-FAST-NEXT: vmovdqa64 %xmm27, %xmm11 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm3 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm6 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm22 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm6 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm29, %xmm17 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm3 -; AVX512F-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm14 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm14 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm14, %xmm3 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm15, %xmm5 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512F-FAST-NEXT: vpmovqb %zmm24, %xmm3 +; AVX512F-FAST-NEXT: vpsrlq $8, %zmm28, %zmm3 +; AVX512F-FAST-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm27 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm12, %ymm19 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm12 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm10, %ymm24 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm29 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm7, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm23 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm25 +; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm11 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm8, %ymm27 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm30 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm4 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm3 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm8 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm12 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm3 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm6, %xmm14 -; 
AVX512F-FAST-NEXT: vmovdqa64 %xmm6, %xmm24 -; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm15 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload -; AVX512F-FAST-NEXT: vpsrld $16, %zmm20, %zmm14 -; AVX512F-FAST-NEXT: vpmovqb %zmm14, %xmm14 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm13, %xmm19 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm13 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm13, %xmm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm26, %zmm5 +; AVX512F-FAST-NEXT: vpmovqb %zmm5, %xmm5 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} ; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm14 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm12 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm15 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm14 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm9, %ymm18 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm21 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm7, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm29 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm7 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm10 ; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3] -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm18 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm8, %xmm16 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm5 +; AVX512F-FAST-NEXT: 
vmovdqa64 %xmm14, %xmm20 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm9 +; AVX512F-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512F-FAST-NEXT: vpsrld $16, %zmm31, %zmm3 +; AVX512F-FAST-NEXT: vpsrlq $16, %zmm28, %zmm3 +; AVX512F-FAST-NEXT: vmovdqa64 %zmm28, %zmm23 ; AVX512F-FAST-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa %ymm1, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm17 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm13, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm19 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm2, %ymm2 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm15 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm17 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm8, %xmm2 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm8, %xmm22 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm3 +; AVX512F-FAST-NEXT: vmovdqa %xmm12, %xmm15 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm14 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm14, %xmm12 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm13 -; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm7 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] -; AVX512F-FAST-NEXT: vpsrld $24, %zmm20, %zmm12 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, %zmm24 -; AVX512F-FAST-NEXT: vpmovqb %zmm12, %xmm12 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm12 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm12, %xmm5 +; AVX512F-FAST-NEXT: vpshufb 
%xmm3, %xmm13, %xmm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] +; AVX512F-FAST-NEXT: vpsrlq $24, %zmm26, %zmm5 +; AVX512F-FAST-NEXT: vpmovqb %zmm5, %xmm5 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm5 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm5 +; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm9 -; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm4 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm17 ; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm10, %xmm22 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm10, %xmm21 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm5 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm5 ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm4 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm23 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm5 -; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm25 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm3 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX512F-FAST-NEXT: vpsrld $24, %zmm31, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, %zmm27 +; AVX512F-FAST-NEXT: vpsrlq $24, %zmm28, %zmm3 ; AVX512F-FAST-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -12683,256 +12629,256 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX512F-FAST-NEXT: vpshufb 
%ymm5, %ymm12, %ymm2 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm4 ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm2 -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm6 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm19 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm26 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4],ymm2[5],ymm6[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm18 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm4 -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm6 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm6 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm14, %xmm17 -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm31 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} zmm6 = zmm20[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] -; AVX512F-FAST-NEXT: vpmovqb %zmm6, %xmm6 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm14 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm14 {%k1} +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512F-FAST-NEXT: vpermd %ymm29, %ymm0, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm15 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm20 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vpermd %ymm28, %ymm0, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa %ymm2, %ymm10 -; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm30 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm1[5],ymm5[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm7 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm8 -; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm11 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm11 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm0 -; 
AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm13 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] -; AVX512F-FAST-NEXT: vpshufd {{.*#+}} zmm11 = zmm27[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm28 +; AVX512F-FAST-NEXT: vpermd %ymm31, %ymm0, %ymm2 +; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm31 = +; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm8 +; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm8 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm8 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm18 +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm15, %xmm10 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm15, %xmm24 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm10 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm22 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm13, %xmm12 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm13, %xmm16 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3] +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm26, %zmm10 +; AVX512F-FAST-NEXT: vpmovqb %zmm10, %xmm10 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm12 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1} +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm29 +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm13 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm27 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm1[7] +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm15 +; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm30 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm9 +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm9, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm7 +; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm6 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3] +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm5, %xmm20 +; AVX512F-FAST-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm11 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1,2],xmm6[3] +; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm19 +; AVX512F-FAST-NEXT: vpsrlq $32, %zmm23, %zmm11 ; AVX512F-FAST-NEXT: vpmovqb %zmm11, %xmm11 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm11[0,1],xmm5[2,3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm29 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm2, %ymm22 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm19 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm11 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm16 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm5[5],ymm11[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm3 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm11 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm13 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm28 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm4, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm25 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm21 = +; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm11 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm28, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm11 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm12 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm5 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm12 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm10 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm13 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm2 -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm14 -; AVX512F-FAST-NEXT: 
vmovdqa64 %xmm31, %xmm6 -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm15 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm8 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm15 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3] -; AVX512F-FAST-NEXT: vpsrlq $40, %zmm24, %zmm14 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] +; AVX512F-FAST-NEXT: vpsrlq $40, %zmm26, %zmm14 ; AVX512F-FAST-NEXT: vpmovqb %zmm14, %xmm14 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm11 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm11 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm14 -; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm4, %ymm14 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm14 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm10, %ymm29 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm16, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm9, %ymm15 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm12 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm6 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm6, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm14 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm14 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm15 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm30 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm8, %xmm31 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm16 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm13 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm17 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm21 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm11 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm7, %xmm23 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm9 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm13 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0,1,2],xmm5[3] -; AVX512F-FAST-NEXT: vpsrlq $40, %zmm27, %zmm13 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3] +; AVX512F-FAST-NEXT: vpsrlq $40, %zmm19, %zmm13 ; AVX512F-FAST-NEXT: vpmovqb %zmm13, %xmm13 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0,1],xmm5[2,3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm22 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm31 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm12, %ymm5 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm11 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm5[5],ymm11[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm11 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm19 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm3 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm13 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm20 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm11 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm22 = +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm4 +; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm11 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm12 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm12 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm12 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm5, %xmm16 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm13 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm28 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm2, %xmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm26 -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm6, %xmm15 -; 
AVX512F-FAST-NEXT: vmovdqa64 %xmm6, %xmm25 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm1, %xmm24 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm15 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3] -; AVX512F-FAST-NEXT: vpsrlq $48, %zmm24, %zmm14 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] +; AVX512F-FAST-NEXT: vpsrlq $48, %zmm26, %zmm14 ; AVX512F-FAST-NEXT: vpmovqb %zmm14, %xmm14 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm11 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm11 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm14 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm6 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm14 -; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm15 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm9, %ymm20 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm12 {%k1} +; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm10 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm6 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm27, %ymm1 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm14 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5,6],ymm6[7] +; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm14 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm15 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm8 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm3 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm5 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] -; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm9 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3,4,5],ymm6[6,7] +; AVX512F-FAST-NEXT: vmovdqa64 %xmm21, %xmm15 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm14 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 +; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm11 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] ; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm14 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm10 -; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm10, %xmm13 +; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm7, %xmm13 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0,1,2],xmm5[3] -; AVX512F-FAST-NEXT: vpsrlq $48, %zmm27, %zmm13 -; AVX512F-FAST-NEXT: vpmovqb %zmm13, %xmm13 -; 
AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0,1],xmm5[2,3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm16 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm1 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm11 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5,6],ymm5[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm11 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4],ymm4[5],ymm11[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm11 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm12 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm13 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm25, %xmm0 -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm14 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3] -; AVX512F-FAST-NEXT: vpsrlq $56, %zmm24, %zmm13 +; AVX512F-FAST-NEXT: vpsrlq $48, %zmm19, %zmm13 ; AVX512F-FAST-NEXT: vpmovqb %zmm13, %xmm13 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] -; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm11 -; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm11 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm4 -; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa64 %ymm20, %ymm0 -; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm1 -; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm17 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512F-FAST-NEXT: vmovdqa64 %ymm25, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm5 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX512F-FAST-NEXT: vmovdqa 
{{.*#+}} xmm3 = +; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm4 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm28, %xmm5 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm11 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm11, %xmm11 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm12 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] +; AVX512F-FAST-NEXT: vpsrlq $56, %zmm26, %zmm11 +; AVX512F-FAST-NEXT: vpmovqb %zmm11, %xmm11 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3] +; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4 +; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm4 {%k1} +; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm2 +; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm8 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7] +; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm1 +; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 +; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm15, %xmm1 +; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm9, %xmm2 +; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm3 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm3 -; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm10, %xmm4 -; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3] -; AVX512F-FAST-NEXT: vpsrlq $56, %zmm27, %zmm3 -; AVX512F-FAST-NEXT: vpmovqb %zmm3, %xmm3 -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 -; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vmovaps %zmm2, (%rsi) +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] +; AVX512F-FAST-NEXT: vpsrlq $56, %zmm19, %zmm2 +; AVX512F-FAST-NEXT: vpmovqb %zmm2, %xmm2 +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] +; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0 +; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512F-FAST-NEXT: vmovaps %zmm1, (%rsi) ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-FAST-NEXT: vmovaps %zmm1, (%rdx) ; AVX512F-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload @@ -12942,12 +12888,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-FAST-NEXT: vmovaps %zmm1, (%r9) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm31, (%rax) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, (%rax) +; AVX512F-FAST-NEXT: vmovdqa64 %zmm17, (%rax) ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512F-FAST-NEXT: addq $440, %rsp # imm = 0x1B8 +; AVX512F-FAST-NEXT: addq $408, %rsp # imm = 0x198 ; AVX512F-FAST-NEXT: vzeroupper ; AVX512F-FAST-NEXT: retq ; @@ -13100,7 +13046,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm24 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm24[0],xmm4[0],xmm24[1],xmm4[1],xmm24[2],xmm4[2],xmm24[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] -; AVX512BW-SLOW-NEXT: vpsrlw $8, %zmm6, %zmm4 +; AVX512BW-SLOW-NEXT: vpsrlq $8, %zmm6, %zmm4 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm4, %xmm4 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 @@ -13146,7 +13092,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] ; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vpsrlw $8, %zmm15, %zmm2 +; AVX512BW-SLOW-NEXT: vpsrlq $8, %zmm15, %zmm2 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -13195,7 +13141,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm26[0],xmm5[0],xmm26[1],xmm5[1],xmm26[2],xmm5[2],xmm26[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] ; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512BW-SLOW-NEXT: vpsrld $16, %zmm13, %zmm5 +; AVX512BW-SLOW-NEXT: vpsrlq $16, %zmm13, %zmm5 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm5 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 @@ -13234,7 +13180,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm15, %zmm9 -; AVX512BW-SLOW-NEXT: vpsrld $16, %zmm15, %zmm3 +; AVX512BW-SLOW-NEXT: vpsrlq $16, %zmm15, %zmm3 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -13277,7 +13223,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm27, %xmm30 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX512BW-SLOW-NEXT: vpsrld $24, %zmm13, %zmm5 +; AVX512BW-SLOW-NEXT: 
vpsrlq $24, %zmm13, %zmm5 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm5 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 @@ -13316,7 +13262,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpshufb %xmm3, %xmm28, %xmm3 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpsrld $24, %zmm9, %zmm3 +; AVX512BW-SLOW-NEXT: vpsrlq $24, %zmm9, %zmm3 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm3, %xmm3 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -13358,7 +13304,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm30 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm30[0],xmm5[0],xmm30[1],xmm5[1],xmm30[2],xmm5[2],xmm30[3],xmm5[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} zmm5 = zmm13[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512BW-SLOW-NEXT: vpsrlq $32, %zmm13, %zmm5 ; AVX512BW-SLOW-NEXT: vpmovqb %zmm5, %xmm5 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3 @@ -13392,8 +13338,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW-NEXT: vpshufb %xmm2, %xmm23, %xmm2 ; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] -; AVX512BW-SLOW-NEXT: vpshufd $245, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512BW-SLOW-NEXT: # zmm2 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512BW-SLOW-NEXT: vpsrlq $32, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload ; AVX512BW-SLOW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -13744,7 +13689,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpshufb %xmm24, %xmm29, %xmm21 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm21[0],xmm26[0],xmm21[1],xmm26[1],xmm21[2],xmm26[2],xmm21[3],xmm26[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3] -; AVX512BW-FAST-NEXT: vpsrlw $8, %zmm4, %zmm15 +; AVX512BW-FAST-NEXT: vpsrlq $8, %zmm4, %zmm15 ; AVX512BW-FAST-NEXT: vpmovqb %zmm15, %xmm15 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 @@ -13763,7 +13708,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpshufb %xmm24, %xmm9, %xmm15 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3] -; AVX512BW-FAST-NEXT: vpsrlw $8, %zmm10, %zmm13 +; AVX512BW-FAST-NEXT: vpsrlq $8, %zmm10, %zmm13 ; AVX512BW-FAST-NEXT: vpmovqb %zmm13, %xmm13 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] @@ -13793,7 +13738,7 @@ define void 
@load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm29, %xmm21 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm21[0],xmm15[0],xmm21[1],xmm15[1],xmm21[2],xmm15[2],xmm21[3],xmm15[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3] -; AVX512BW-FAST-NEXT: vpsrld $16, %zmm4, %zmm15 +; AVX512BW-FAST-NEXT: vpsrlq $16, %zmm4, %zmm15 ; AVX512BW-FAST-NEXT: vpmovqb %zmm15, %xmm15 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 @@ -13812,7 +13757,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm9, %xmm13 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3] -; AVX512BW-FAST-NEXT: vpsrld $16, %zmm10, %zmm11 +; AVX512BW-FAST-NEXT: vpsrlq $16, %zmm10, %zmm11 ; AVX512BW-FAST-NEXT: vpmovqb %zmm11, %xmm11 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -13840,7 +13785,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm29, %xmm15 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3] -; AVX512BW-FAST-NEXT: vpsrld $24, %zmm4, %zmm14 +; AVX512BW-FAST-NEXT: vpsrlq $24, %zmm4, %zmm14 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm4, %zmm17 ; AVX512BW-FAST-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3] @@ -13860,7 +13805,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm3 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX512BW-FAST-NEXT: vpsrld $24, %zmm10, %zmm2 +; AVX512BW-FAST-NEXT: vpsrlq $24, %zmm10, %zmm2 ; AVX512BW-FAST-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -13892,7 +13837,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm29, %xmm28 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3] -; AVX512BW-FAST-NEXT: vpshufd {{.*#+}} zmm15 = zmm17[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512BW-FAST-NEXT: vpsrlq $32, %zmm17, %zmm15 ; AVX512BW-FAST-NEXT: vpmovqb %zmm15, %xmm15 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12 @@ -13915,7 +13860,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpshufb %xmm21, %xmm9, %xmm13 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3] -; AVX512BW-FAST-NEXT: vpshufd {{.*#+}} zmm11 = 
zmm10[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512BW-FAST-NEXT: vpsrlq $32, %zmm10, %zmm11 ; AVX512BW-FAST-NEXT: vpmovqb %zmm11, %xmm11 ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3] ; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] @@ -14091,12 +14036,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; AVX: {{.*}} ; AVX2: {{.*}} +; AVX512BW: {{.*}} ; AVX512BW-ONLY-FAST: {{.*}} ; AVX512BW-ONLY-SLOW: {{.*}} ; AVX512DQ-FAST: {{.*}} ; AVX512DQ-SLOW: {{.*}} ; AVX512DQBW-FAST: {{.*}} ; AVX512DQBW-SLOW: {{.*}} +; AVX512F: {{.*}} ; AVX512F-ONLY-FAST: {{.*}} ; AVX512F-ONLY-SLOW: {{.*}} ; FALLBACK0: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll index 4a7d8c1..ac65790 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -465,13 +465,13 @@ define <32 x i16> @shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_z define <8 x i16> @pr32967(<32 x i16> %v) { ; KNL-LABEL: pr32967: ; KNL: ## %bb.0: -; KNL-NEXT: vpsrld $16, %zmm0, %zmm0 +; KNL-NEXT: vpsrlq $16, %zmm0, %zmm0 ; KNL-NEXT: vpmovqw %zmm0, %xmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: pr32967: ; SKX: ## %bb.0: -; SKX-NEXT: vpsrld $16, %zmm0, %zmm0 +; SKX-NEXT: vpsrlq $16, %zmm0, %zmm0 ; SKX-NEXT: vpmovqw %zmm0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll index 490bd21..ef9fe2c 100644 --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -529,7 +529,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm1 -; AVX512-NEXT: vpsrlw $8, %zmm0, %zmm2 +; AVX512-NEXT: vpsrld $8, %zmm0, %zmm2 ; AVX512-NEXT: vpmovdb %zmm2, %xmm2 ; AVX512-NEXT: vpsrld $16, %zmm0, %zmm3 ; AVX512-NEXT: vpmovdb %zmm3, %xmm3 -- 2.7.4