From fed302ae37ec56badc8283e39070561e47ae740e Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 28 Apr 2019 10:02:34 +0000 Subject: [PATCH] [X86][AVX] Add AVX512DQ coverage for masked memory ops tests (PR34584) llvm-svn: 359395 --- llvm/test/CodeGen/X86/masked_compressstore.ll | 357 ++++++++- llvm/test/CodeGen/X86/masked_expandload.ll | 354 ++++++++- llvm/test/CodeGen/X86/masked_load.ll | 1015 +++++++++++++++++++++++-- llvm/test/CodeGen/X86/masked_store.ll | 874 +++++++++++++++++++-- 4 files changed, 2444 insertions(+), 156 deletions(-) diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll index 3542bc9..6ee8779 100644 --- a/llvm/test/CodeGen/X86/masked_compressstore.ll +++ b/llvm/test/CodeGen/X86/masked_compressstore.ll @@ -4,7 +4,8 @@ ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F -; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VLBW +; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512dq,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLDQ +; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLBW ; ; vXf64 @@ -266,6 +267,15 @@ define void @compressstore_v8f64_v8i1(double* %base, <8 x double> %V, <8 x i1> % ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: compressstore_v8f64_v8i1: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512VLDQ-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpmovd2m %ymm1, %k1 +; AVX512VLDQ-NEXT: vcompresspd %zmm0, (%rdi) {%k1} +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: compressstore_v8f64_v8i1: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vpsllw $15, %xmm1, %xmm1 @@ -789,6 +799,33 @@ define void @compressstore_v16f64_v16i1(double* %base, <16 x double> %V, <16 x i ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: compressstore_v16f64_v16i1: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero +; AVX512VLDQ-NEXT: vpslld $31, %zmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k1 +; AVX512VLDQ-NEXT: kmovb %k1, %eax +; AVX512VLDQ-NEXT: movl %eax, %ecx +; AVX512VLDQ-NEXT: shrl %ecx +; AVX512VLDQ-NEXT: andl $-43, %ecx +; AVX512VLDQ-NEXT: subl %ecx, %eax +; AVX512VLDQ-NEXT: movl %eax, %ecx +; AVX512VLDQ-NEXT: andl $858993459, %ecx ## imm = 0x33333333 +; AVX512VLDQ-NEXT: shrl $2, %eax +; AVX512VLDQ-NEXT: andl $858993459, %eax ## imm = 0x33333333 +; AVX512VLDQ-NEXT: addl %ecx, %eax +; AVX512VLDQ-NEXT: movl %eax, %ecx +; AVX512VLDQ-NEXT: shrl $4, %ecx +; AVX512VLDQ-NEXT: addl %eax, %ecx +; AVX512VLDQ-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F +; AVX512VLDQ-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101 +; AVX512VLDQ-NEXT: shrl $24, %eax +; AVX512VLDQ-NEXT: kshiftrw $8, %k1, %k2 +; AVX512VLDQ-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2} +; AVX512VLDQ-NEXT: vcompresspd %zmm0, (%rdi) {%k1} +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: compressstore_v16f64_v16i1: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vpsllw $7, %xmm2, %xmm2 @@ -919,13 +956,13 @@ define void @compressstore_v2f32_v2i32(float* %base, <2 x float> %V, <2 x i32> % ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VLBW-LABEL: compressstore_v2f32_v2i32: -; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX512VLBW-NEXT: vptestnmq %xmm1, %xmm1, %k1 -; AVX512VLBW-NEXT: vcompressps %xmm0, (%rdi) {%k1} -; AVX512VLBW-NEXT: retq +; AVX512VL-LABEL: compressstore_v2f32_v2i32: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX512VL-NEXT: vptestnmq %xmm1, %xmm1, %k1 +; AVX512VL-NEXT: vcompressps %xmm0, (%rdi) {%k1} +; AVX512VL-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer call void @llvm.masked.compressstore.v2f32(<2 x float> %V, float* %base, <2 x i1> %mask) ret void @@ -1041,6 +1078,13 @@ define void @compressstore_v4f32_v4i1(float* %base, <4 x float> %V, <4 x i1> %ma ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: compressstore_v4f32_v4i1: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512VLDQ-NEXT: vpmovd2m %xmm1, %k1 +; AVX512VLDQ-NEXT: vcompressps %xmm0, (%rdi) {%k1} +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: compressstore_v4f32_v4i1: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vpslld $31, %xmm1, %xmm1 @@ -1254,6 +1298,15 @@ define void @compressstore_v8f32_v8i1(float* %base, <8 x float> %V, <8 x i1> %ma ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: compressstore_v8f32_v8i1: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512VLDQ-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpmovd2m %ymm1, %k1 +; AVX512VLDQ-NEXT: vcompressps %ymm0, (%rdi) {%k1} +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: compressstore_v8f32_v8i1: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vpsllw $15, %xmm1, %xmm1 @@ -1347,6 +1400,14 @@ define void @compressstore_v16f32_const(float* %base, <16 x float> %V) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: compressstore_v16f32_const: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: movw $-2049, %ax ## imm = 0xF7FF +; AVX512VLDQ-NEXT: kmovw %eax, %k1 +; AVX512VLDQ-NEXT: vcompressps %zmm0, (%rdi) {%k1} +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: compressstore_v16f32_const: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: movw $-2049, %ax ## imm = 0xF7FF @@ -2730,6 +2791,13 @@ define void @compressstore_v2i64_v2i1(i64* %base, <2 x i64> %V, <2 x i1> %mask) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: compressstore_v2i64_v2i1: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpsllq $63, %xmm1, %xmm1 +; AVX512VLDQ-NEXT: vpmovq2m %xmm1, %k1 +; AVX512VLDQ-NEXT: vpcompressq %xmm0, (%rdi) {%k1} +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: compressstore_v2i64_v2i1: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vpsllq $63, %xmm1, %xmm1 @@ -2884,6 +2952,14 @@ define void @compressstore_v4i64_v4i1(i64* %base, <4 x i64> %V, <4 x i1> %mask) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: compressstore_v4i64_v4i1: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512VLDQ-NEXT: vpmovd2m %xmm1, %k1 +; AVX512VLDQ-NEXT: vpcompressq %ymm0, (%rdi) {%k1} +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: compressstore_v4i64_v4i1: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vpslld $31, %xmm1, %xmm1 @@ -3155,6 +3231,15 @@ define void @compressstore_v8i64_v8i1(i64* %base, <8 x i64> %V, <8 x i1> %mask) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: compressstore_v8i64_v8i1: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512VLDQ-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpmovd2m %ymm1, %k1 +; AVX512VLDQ-NEXT: vpcompressq %zmm0, (%rdi) {%k1} +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: compressstore_v8i64_v8i1: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vpsllw $15, %xmm1, %xmm1 @@ -3290,11 +3375,11 @@ define void @compressstore_v4i32_v4i32(i32* %base, <4 x i32> %V, <4 x i32> %trig ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VLBW-LABEL: compressstore_v4i32_v4i32: -; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; AVX512VLBW-NEXT: vpcompressd %xmm0, (%rdi) {%k1} -; AVX512VLBW-NEXT: retq +; AVX512VL-LABEL: compressstore_v4i32_v4i32: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vptestnmd %xmm1, %xmm1, %k1 +; AVX512VL-NEXT: vpcompressd %xmm0, (%rdi) {%k1} +; AVX512VL-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer call void @llvm.masked.compressstore.v4i32(<4 x i32> %V, i32* %base, <4 x i1> %mask) ret void @@ -3597,6 +3682,89 @@ define void @compressstore_v8i16_v8i16(i16* %base, <8 x i16> %V, <8 x i16> %trig ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: compressstore_v8i16_v8i16: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB11_2 +; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.store +; AVX512VLDQ-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX512VLDQ-NEXT: addq $2, %rdi +; AVX512VLDQ-NEXT: LBB11_2: ## %else +; AVX512VLDQ-NEXT: kshiftrb $1, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB11_4 +; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.store1 +; AVX512VLDQ-NEXT: vpextrw $1, %xmm0, (%rdi) +; AVX512VLDQ-NEXT: addq $2, %rdi +; AVX512VLDQ-NEXT: LBB11_4: ## %else2 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0 +; AVX512VLDQ-NEXT: kshiftrb $2, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB11_6 +; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.store4 +; AVX512VLDQ-NEXT: vpextrw $2, %xmm0, (%rdi) +; AVX512VLDQ-NEXT: addq $2, %rdi +; AVX512VLDQ-NEXT: LBB11_6: ## %else5 +; AVX512VLDQ-NEXT: kshiftrb $3, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB11_8 +; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.store7 +; AVX512VLDQ-NEXT: vpextrw $3, %xmm0, (%rdi) +; AVX512VLDQ-NEXT: addq $2, %rdi +; AVX512VLDQ-NEXT: LBB11_8: ## %else8 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0 +; AVX512VLDQ-NEXT: kshiftrb $4, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB11_10 +; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.store10 +; AVX512VLDQ-NEXT: vpextrw $4, %xmm0, (%rdi) +; AVX512VLDQ-NEXT: addq $2, %rdi +; AVX512VLDQ-NEXT: LBB11_10: ## %else11 +; AVX512VLDQ-NEXT: kshiftrb $5, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB11_12 +; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.store13 +; AVX512VLDQ-NEXT: vpextrw $5, %xmm0, (%rdi) +; AVX512VLDQ-NEXT: addq $2, %rdi +; AVX512VLDQ-NEXT: LBB11_12: ## %else14 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 +; AVX512VLDQ-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX512VLDQ-NEXT: vpmovd2m %ymm1, %k0 +; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB11_14 +; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.store16 +; AVX512VLDQ-NEXT: vpextrw $6, %xmm0, (%rdi) +; AVX512VLDQ-NEXT: addq $2, %rdi +; AVX512VLDQ-NEXT: LBB11_14: ## %else17 +; AVX512VLDQ-NEXT: kshiftrb $7, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB11_16 +; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.store19 +; AVX512VLDQ-NEXT: vpextrw $7, %xmm0, (%rdi) +; AVX512VLDQ-NEXT: LBB11_16: ## %else20 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: compressstore_v8i16_v8i16: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vptestnmw %xmm1, %xmm1, %k0 @@ -4249,6 +4417,169 @@ define void @compressstore_v16i8_v16i8(i8* %base, <16 x i8> %V, <16 x i8> %trigg ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: compressstore_v16i8_v16i8: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_2 +; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.store +; AVX512VLDQ-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_2: ## %else +; AVX512VLDQ-NEXT: kshiftrw $1, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_4 +; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.store1 +; AVX512VLDQ-NEXT: vpextrb $1, %xmm0, (%rdi) +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_4: ## %else2 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $2, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_6 +; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.store4 +; AVX512VLDQ-NEXT: vpextrb $2, %xmm0, (%rdi) +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_6: ## %else5 +; AVX512VLDQ-NEXT: kshiftrw $3, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_8 +; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.store7 +; AVX512VLDQ-NEXT: vpextrb $3, %xmm0, (%rdi) +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_8: ## %else8 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $4, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_10 +; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.store10 +; AVX512VLDQ-NEXT: vpextrb $4, %xmm0, (%rdi) +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_10: ## %else11 +; AVX512VLDQ-NEXT: kshiftrw $5, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_12 +; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.store13 +; AVX512VLDQ-NEXT: vpextrb $5, %xmm0, (%rdi) +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_12: ## %else14 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $6, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_14 +; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.store16 +; AVX512VLDQ-NEXT: vpextrb $6, %xmm0, (%rdi) +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_14: ## %else17 +; AVX512VLDQ-NEXT: kshiftrw $7, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_16 +; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.store19 +; AVX512VLDQ-NEXT: vpextrb $7, %xmm0, (%rdi) +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_16: ## %else20 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $8, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_18 +; AVX512VLDQ-NEXT: ## %bb.17: ## %cond.store22 +; AVX512VLDQ-NEXT: vpextrb $8, %xmm0, (%rdi) +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_18: ## %else23 +; AVX512VLDQ-NEXT: kshiftrw $9, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_20 +; AVX512VLDQ-NEXT: ## %bb.19: ## %cond.store25 +; AVX512VLDQ-NEXT: vpextrb $9, %xmm0, (%rdi) +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_20: ## %else26 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $10, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_22 +; AVX512VLDQ-NEXT: ## %bb.21: ## %cond.store28 +; AVX512VLDQ-NEXT: vpextrb $10, %xmm0, (%rdi) +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_22: ## %else29 +; AVX512VLDQ-NEXT: kshiftrw $11, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_24 +; AVX512VLDQ-NEXT: ## %bb.23: ## %cond.store31 +; AVX512VLDQ-NEXT: vpextrb $11, %xmm0, (%rdi) +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_24: ## %else32 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $12, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_26 +; AVX512VLDQ-NEXT: ## %bb.25: ## %cond.store34 +; AVX512VLDQ-NEXT: vpextrb $12, %xmm0, (%rdi) +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_26: ## %else35 +; AVX512VLDQ-NEXT: kshiftrw $13, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_28 +; AVX512VLDQ-NEXT: ## %bb.27: ## %cond.store37 +; AVX512VLDQ-NEXT: vpextrb $13, %xmm0, (%rdi) +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_28: ## %else38 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512VLDQ-NEXT: vpmovd2m %zmm1, %k0 +; AVX512VLDQ-NEXT: kshiftrw $14, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_30 +; AVX512VLDQ-NEXT: ## %bb.29: ## %cond.store40 +; AVX512VLDQ-NEXT: vpextrb $14, %xmm0, (%rdi) +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_30: ## %else41 +; AVX512VLDQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_32 +; AVX512VLDQ-NEXT: ## %bb.31: ## %cond.store43 +; AVX512VLDQ-NEXT: vpextrb $15, %xmm0, (%rdi) +; AVX512VLDQ-NEXT: LBB12_32: ## %else44 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: compressstore_v16i8_v16i8: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vptestnmb %xmm1, %xmm1, %k0 diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll index 5b89d34..351e040 100644 --- a/llvm/test/CodeGen/X86/masked_expandload.ll +++ b/llvm/test/CodeGen/X86/masked_expandload.ll @@ -4,7 +4,8 @@ ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F -; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VLBW +; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512dq,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLDQ +; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLBW ; ; vXf64 @@ -82,11 +83,11 @@ define <2 x double> @expandload_v2f64_v2i64(double* %base, <2 x double> %src0, < ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VLBW-LABEL: expandload_v2f64_v2i64: -; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: vptestnmq %xmm1, %xmm1, %k1 -; AVX512VLBW-NEXT: vexpandpd (%rdi), %xmm0 {%k1} -; AVX512VLBW-NEXT: retq +; AVX512VL-LABEL: expandload_v2f64_v2i64: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vptestnmq %xmm1, %xmm1, %k1 +; AVX512VL-NEXT: vexpandpd (%rdi), %xmm0 {%k1} +; AVX512VL-NEXT: retq %mask = icmp eq <2 x i64> %trigger, zeroinitializer %res = call <2 x double> @llvm.masked.expandload.v2f64(double* %base, <2 x i1> %mask, <2 x double> %src0) ret <2 x double>%res @@ -263,11 +264,11 @@ define <4 x double> @expandload_v4f64_v4i64(double* %base, <4 x double> %src0, < ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; -; AVX512VLBW-LABEL: expandload_v4f64_v4i64: -; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: vptestnmq %ymm1, %ymm1, %k1 -; AVX512VLBW-NEXT: vexpandpd (%rdi), %ymm0 {%k1} -; AVX512VLBW-NEXT: retq +; AVX512VL-LABEL: expandload_v4f64_v4i64: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vptestnmq %ymm1, %ymm1, %k1 +; AVX512VL-NEXT: vexpandpd (%rdi), %ymm0 {%k1} +; AVX512VL-NEXT: retq %mask = icmp eq <4 x i64> %trigger, zeroinitializer %res = call <4 x double> @llvm.masked.expandload.v4f64(double* %base, <4 x i1> %mask, <4 x double> %src0) ret <4 x double>%res @@ -471,6 +472,14 @@ define <8 x double> @expandload_v8f64_v8i1(double* %base, <8 x double> %src0, <8 ; AVX512F-NEXT: vexpandpd (%rdi), %zmm0 {%k1} ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: expandload_v8f64_v8i1: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512VLDQ-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: vpmovd2m %ymm1, %k1 +; AVX512VLDQ-NEXT: vexpandpd (%rdi), %zmm0 {%k1} +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: expandload_v8f64_v8i1: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vpsllw $15, %xmm1, %xmm1 @@ -1180,6 +1189,31 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src ; AVX512F-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k1} ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: expandload_v16f64_v16i32: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512VLDQ-NEXT: vptestnmd %ymm3, %ymm3, %k1 +; AVX512VLDQ-NEXT: vptestnmd %ymm2, %ymm2, %k2 +; AVX512VLDQ-NEXT: kmovb %k2, %eax +; AVX512VLDQ-NEXT: movl %eax, %ecx +; AVX512VLDQ-NEXT: shrl %ecx +; AVX512VLDQ-NEXT: andl $-43, %ecx +; AVX512VLDQ-NEXT: subl %ecx, %eax +; AVX512VLDQ-NEXT: movl %eax, %ecx +; AVX512VLDQ-NEXT: andl $858993459, %ecx ## imm = 0x33333333 +; AVX512VLDQ-NEXT: shrl $2, %eax +; AVX512VLDQ-NEXT: andl $858993459, %eax ## imm = 0x33333333 +; AVX512VLDQ-NEXT: addl %ecx, %eax +; AVX512VLDQ-NEXT: movl %eax, %ecx +; AVX512VLDQ-NEXT: shrl $4, %ecx +; AVX512VLDQ-NEXT: addl %eax, %ecx +; AVX512VLDQ-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F +; AVX512VLDQ-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101 +; AVX512VLDQ-NEXT: shrl $24, %eax +; AVX512VLDQ-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k1} +; AVX512VLDQ-NEXT: vexpandpd (%rdi), %zmm0 {%k2} +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: expandload_v16f64_v16i32: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vextracti64x4 $1, %zmm2, %ymm3 @@ -1317,13 +1351,13 @@ define <2 x float> @expandload_v2f32_v2i1(float* %base, <2 x float> %src0, <2 x ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VLBW-LABEL: expandload_v2f32_v2i1: -; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX512VLBW-NEXT: vptestnmq %xmm1, %xmm1, %k1 -; AVX512VLBW-NEXT: vexpandps (%rdi), %xmm0 {%k1} -; AVX512VLBW-NEXT: retq +; AVX512VL-LABEL: expandload_v2f32_v2i1: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] +; AVX512VL-NEXT: vptestnmq %xmm1, %xmm1, %k1 +; AVX512VL-NEXT: vexpandps (%rdi), %xmm0 {%k1} +; AVX512VL-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer %res = call <2 x float> @llvm.masked.expandload.v2f32(float* %base, <2 x i1> %mask, <2 x float> %src0) ret <2 x float> %res @@ -1367,6 +1401,13 @@ define <4 x float> @expandload_v4f32_const(float* %base, <4 x float> %src0) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: expandload_v4f32_const: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: movb $7, %al +; AVX512VLDQ-NEXT: kmovw %eax, %k1 +; AVX512VLDQ-NEXT: vexpandps (%rdi), %xmm0 {%k1} +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: expandload_v4f32_const: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: movb $7, %al @@ -1444,6 +1485,13 @@ define <16 x float> @expandload_v16f32_const(float* %base, <16 x float> %src0) { ; AVX512F-NEXT: vexpandps (%rdi), %zmm0 {%k1} ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: expandload_v16f32_const: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: movw $30719, %ax ## imm = 0x77FF +; AVX512VLDQ-NEXT: kmovw %eax, %k1 +; AVX512VLDQ-NEXT: vexpandps (%rdi), %zmm0 {%k1} +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: expandload_v16f32_const: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: movw $30719, %ax ## imm = 0x77FF @@ -1489,6 +1537,13 @@ define <16 x float> @expandload_v16f32_const_undef(float* %base) { ; AVX512F-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: expandload_v16f32_const_undef: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: movw $-2049, %ax ## imm = 0xF7FF +; AVX512VLDQ-NEXT: kmovw %eax, %k1 +; AVX512VLDQ-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z} +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: expandload_v16f32_const_undef: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: movw $-2049, %ax ## imm = 0xF7FF @@ -2954,6 +3009,13 @@ define <2 x i64> @expandload_v2i64_const(i64* %base, <2 x i64> %src0) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: expandload_v2i64_const: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: movb $2, %al +; AVX512VLDQ-NEXT: kmovw %eax, %k1 +; AVX512VLDQ-NEXT: vpexpandq (%rdi), %xmm0 {%k1} +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: expandload_v2i64_const: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: movb $2, %al @@ -3094,11 +3156,11 @@ define <4 x i32> @expandload_v4i32_v4i32(i32* %base, <4 x i32> %src0, <4 x i32> ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VLBW-LABEL: expandload_v4i32_v4i32: -; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; AVX512VLBW-NEXT: vpexpandd (%rdi), %xmm0 {%k1} -; AVX512VLBW-NEXT: retq +; AVX512VL-LABEL: expandload_v4i32_v4i32: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vptestnmd %xmm1, %xmm1, %k1 +; AVX512VL-NEXT: vpexpandd (%rdi), %xmm0 {%k1} +; AVX512VL-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer %res = call <4 x i32> @llvm.masked.expandload.v4i32(i32* %base, <4 x i1> %mask, <4 x i32> %src0) ret <4 x i32>%res @@ -3393,6 +3455,89 @@ define <8 x i16> @expandload_v8i16_v8i16(i16* %base, <8 x i16> %src0, <8 x i16> ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: expandload_v8i16_v8i16: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB11_2 +; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.load +; AVX512VLDQ-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: addq $2, %rdi +; AVX512VLDQ-NEXT: LBB11_2: ## %else +; AVX512VLDQ-NEXT: kshiftrb $1, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB11_4 +; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.load1 +; AVX512VLDQ-NEXT: vpinsrw $1, (%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: addq $2, %rdi +; AVX512VLDQ-NEXT: LBB11_4: ## %else2 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0 +; AVX512VLDQ-NEXT: kshiftrb $2, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB11_6 +; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.load5 +; AVX512VLDQ-NEXT: vpinsrw $2, (%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: addq $2, %rdi +; AVX512VLDQ-NEXT: LBB11_6: ## %else6 +; AVX512VLDQ-NEXT: kshiftrb $3, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB11_8 +; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.load9 +; AVX512VLDQ-NEXT: vpinsrw $3, (%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: addq $2, %rdi +; AVX512VLDQ-NEXT: LBB11_8: ## %else10 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0 +; AVX512VLDQ-NEXT: kshiftrb $4, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB11_10 +; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.load13 +; AVX512VLDQ-NEXT: vpinsrw $4, (%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: addq $2, %rdi +; AVX512VLDQ-NEXT: LBB11_10: ## %else14 +; AVX512VLDQ-NEXT: kshiftrb $5, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB11_12 +; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.load17 +; AVX512VLDQ-NEXT: vpinsrw $5, (%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: addq $2, %rdi +; AVX512VLDQ-NEXT: LBB11_12: ## %else18 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 +; AVX512VLDQ-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX512VLDQ-NEXT: vpmovd2m %ymm1, %k0 +; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB11_14 +; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.load21 +; AVX512VLDQ-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: addq $2, %rdi +; AVX512VLDQ-NEXT: LBB11_14: ## %else22 +; AVX512VLDQ-NEXT: kshiftrb $7, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB11_16 +; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.load25 +; AVX512VLDQ-NEXT: vpinsrw $7, (%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: LBB11_16: ## %else26 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: expandload_v8i16_v8i16: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vptestnmw %xmm1, %xmm1, %k0 @@ -4120,6 +4265,169 @@ define <16 x i8> @expandload_v16i8_v16i8(i8* %base, <16 x i8> %src0, <16 x i8> % ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: expandload_v16i8_v16i8: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_2 +; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.load +; AVX512VLDQ-NEXT: vpinsrb $0, (%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_2: ## %else +; AVX512VLDQ-NEXT: kshiftrw $1, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_4 +; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.load1 +; AVX512VLDQ-NEXT: vpinsrb $1, (%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_4: ## %else2 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $2, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_6 +; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.load5 +; AVX512VLDQ-NEXT: vpinsrb $2, (%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_6: ## %else6 +; AVX512VLDQ-NEXT: kshiftrw $3, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_8 +; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.load9 +; AVX512VLDQ-NEXT: vpinsrb $3, (%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_8: ## %else10 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $4, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_10 +; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.load13 +; AVX512VLDQ-NEXT: vpinsrb $4, (%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_10: ## %else14 +; AVX512VLDQ-NEXT: kshiftrw $5, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_12 +; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.load17 +; AVX512VLDQ-NEXT: vpinsrb $5, (%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_12: ## %else18 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $6, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_14 +; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.load21 +; AVX512VLDQ-NEXT: vpinsrb $6, (%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_14: ## %else22 +; AVX512VLDQ-NEXT: kshiftrw $7, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_16 +; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.load25 +; AVX512VLDQ-NEXT: vpinsrb $7, (%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_16: ## %else26 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $8, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_18 +; AVX512VLDQ-NEXT: ## %bb.17: ## %cond.load29 +; AVX512VLDQ-NEXT: vpinsrb $8, (%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_18: ## %else30 +; AVX512VLDQ-NEXT: kshiftrw $9, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_20 +; AVX512VLDQ-NEXT: ## %bb.19: ## %cond.load33 +; AVX512VLDQ-NEXT: vpinsrb $9, (%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_20: ## %else34 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $10, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_22 +; AVX512VLDQ-NEXT: ## %bb.21: ## %cond.load37 +; AVX512VLDQ-NEXT: vpinsrb $10, (%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_22: ## %else38 +; AVX512VLDQ-NEXT: kshiftrw $11, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_24 +; AVX512VLDQ-NEXT: ## %bb.23: ## %cond.load41 +; AVX512VLDQ-NEXT: vpinsrb $11, (%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_24: ## %else42 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $12, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_26 +; AVX512VLDQ-NEXT: ## %bb.25: ## %cond.load45 +; AVX512VLDQ-NEXT: vpinsrb $12, (%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_26: ## %else46 +; AVX512VLDQ-NEXT: kshiftrw $13, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_28 +; AVX512VLDQ-NEXT: ## %bb.27: ## %cond.load49 +; AVX512VLDQ-NEXT: vpinsrb $13, (%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_28: ## %else50 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512VLDQ-NEXT: vpmovd2m %zmm1, %k0 +; AVX512VLDQ-NEXT: kshiftrw $14, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_30 +; AVX512VLDQ-NEXT: ## %bb.29: ## %cond.load53 +; AVX512VLDQ-NEXT: vpinsrb $14, (%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: incq %rdi +; AVX512VLDQ-NEXT: LBB12_30: ## %else54 +; AVX512VLDQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB12_32 +; AVX512VLDQ-NEXT: ## %bb.31: ## %cond.load57 +; AVX512VLDQ-NEXT: vpinsrb $15, (%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: LBB12_32: ## %else58 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: expandload_v16i8_v16i8: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vptestnmb %xmm1, %xmm1, %k0 diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll index 7c4d970..4767669 100644 --- a/llvm/test/CodeGen/X86/masked_load.ll +++ b/llvm/test/CodeGen/X86/masked_load.ll @@ -4,7 +4,8 @@ ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F -; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VLBW +; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512dq,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLDQ +; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLBW ; ; vXf64 @@ -94,11 +95,11 @@ define <2 x double> @load_v2f64_v2i64(<2 x i64> %trigger, <2 x double>* %addr, < ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VLBW-LABEL: load_v2f64_v2i64: -; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; AVX512VLBW-NEXT: vblendmpd (%rdi), %xmm1, %xmm0 {%k1} -; AVX512VLBW-NEXT: retq +; AVX512VL-LABEL: load_v2f64_v2i64: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vblendmpd (%rdi), %xmm1, %xmm0 {%k1} +; AVX512VL-NEXT: retq %mask = icmp eq <2 x i64> %trigger, zeroinitializer %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> %mask, <2 x double> %dst) ret <2 x double> %res @@ -205,11 +206,11 @@ define <4 x double> @load_v4f64_v4i32(<4 x i32> %trigger, <4 x double>* %addr, < ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; -; AVX512VLBW-LABEL: load_v4f64_v4i32: -; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; AVX512VLBW-NEXT: vblendmpd (%rdi), %ymm1, %ymm0 {%k1} -; AVX512VLBW-NEXT: retq +; AVX512VL-LABEL: load_v4f64_v4i32: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vblendmpd (%rdi), %ymm1, %ymm0 {%k1} +; AVX512VL-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1> %mask, <4 x double> %dst) ret <4 x double> %res @@ -317,11 +318,11 @@ define <4 x double> @load_v4f64_v4i32_zero(<4 x i32> %trigger, <4 x double>* %ad ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; -; AVX512VLBW-LABEL: load_v4f64_v4i32_zero: -; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; AVX512VLBW-NEXT: vmovapd (%rdi), %ymm0 {%k1} {z} -; AVX512VLBW-NEXT: retq +; AVX512VL-LABEL: load_v4f64_v4i32_zero: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmovapd (%rdi), %ymm0 {%k1} {z} +; AVX512VL-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1> %mask, <4 x double>zeroinitializer) ret <4 x double> %res @@ -430,11 +431,11 @@ define <4 x double> @load_v4f64_v4i64(<4 x i64> %trigger, <4 x double>* %addr, < ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; -; AVX512VLBW-LABEL: load_v4f64_v4i64: -; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: vptestnmq %ymm0, %ymm0, %k1 -; AVX512VLBW-NEXT: vblendmpd (%rdi), %ymm1, %ymm0 {%k1} -; AVX512VLBW-NEXT: retq +; AVX512VL-LABEL: load_v4f64_v4i64: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vptestnmq %ymm0, %ymm0, %k1 +; AVX512VL-NEXT: vblendmpd (%rdi), %ymm1, %ymm0 {%k1} +; AVX512VL-NEXT: retq %mask = icmp eq <4 x i64> %trigger, zeroinitializer %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> %mask, <4 x double> %dst) ret <4 x double> %res @@ -614,6 +615,15 @@ define <8 x double> @load_v8f64_v8i16(<8 x i16> %trigger, <8 x double>* %addr, < ; AVX512F-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: load_v8f64_v8i16: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k1 +; AVX512VLDQ-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1} +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: load_v8f64_v8i16: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vptestnmw %xmm0, %xmm0, %k1 @@ -888,13 +898,13 @@ define <2 x float> @load_v2f32_v2i32(<2 x i32> %trigger, <2 x float>* %addr, <2 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VLBW-LABEL: load_v2f32_v2i32: -; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512VLBW-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; AVX512VLBW-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1} -; AVX512VLBW-NEXT: retq +; AVX512VL-LABEL: load_v2f32_v2i32: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1} +; AVX512VL-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst) ret <2 x float> %res @@ -977,13 +987,13 @@ define <2 x float> @load_v2f32_v2i32_undef(<2 x i32> %trigger, <2 x float>* %add ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VLBW-LABEL: load_v2f32_v2i32_undef: -; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512VLBW-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; AVX512VLBW-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} -; AVX512VLBW-NEXT: retq +; AVX512VL-LABEL: load_v2f32_v2i32_undef: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} +; AVX512VL-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float>undef) ret <2 x float> %res @@ -1085,11 +1095,11 @@ define <4 x float> @load_v4f32_v4i32(<4 x i32> %trigger, <4 x float>* %addr, <4 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VLBW-LABEL: load_v4f32_v4i32: -; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; AVX512VLBW-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1} -; AVX512VLBW-NEXT: retq +; AVX512VL-LABEL: load_v4f32_v4i32: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1} +; AVX512VL-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> %mask, <4 x float> %dst) ret <4 x float> %res @@ -1253,6 +1263,14 @@ define <8 x float> @load_v8f32_v8i1_zero(<8 x i1> %mask, <8 x float>* %addr) { ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: load_v8f32_v8i1_zero: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512VLDQ-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k1 +; AVX512VLDQ-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z} +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: load_v8f32_v8i1_zero: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vpsllw $15, %xmm0, %xmm0 @@ -1440,11 +1458,11 @@ define <8 x float> @load_v8f32_v8i32(<8 x i32> %trigger, <8 x float>* %addr, <8 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; -; AVX512VLBW-LABEL: load_v8f32_v8i32: -; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: vptestnmd %ymm0, %ymm0, %k1 -; AVX512VLBW-NEXT: vblendmps (%rdi), %ymm1, %ymm0 {%k1} -; AVX512VLBW-NEXT: retq +; AVX512VL-LABEL: load_v8f32_v8i32: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 +; AVX512VL-NEXT: vblendmps (%rdi), %ymm1, %ymm0 {%k1} +; AVX512VL-NEXT: retq %mask = icmp eq <8 x i32> %trigger, zeroinitializer %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> %dst) ret <8 x float> %res @@ -1552,11 +1570,11 @@ define <2 x i64> @load_v2i64_v2i64(<2 x i64> %trigger, <2 x i64>* %addr, <2 x i6 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VLBW-LABEL: load_v2i64_v2i64: -; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; AVX512VLBW-NEXT: vpblendmq (%rdi), %xmm1, %xmm0 {%k1} -; AVX512VLBW-NEXT: retq +; AVX512VL-LABEL: load_v2i64_v2i64: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vpblendmq (%rdi), %xmm1, %xmm0 {%k1} +; AVX512VL-NEXT: retq %mask = icmp eq <2 x i64> %trigger, zeroinitializer %res = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %addr, i32 4, <2 x i1> %mask, <2 x i64> %dst) ret <2 x i64> %res @@ -1667,11 +1685,11 @@ define <4 x i64> @load_v4i64_v4i64(<4 x i64> %trigger, <4 x i64>* %addr, <4 x i6 ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; -; AVX512VLBW-LABEL: load_v4i64_v4i64: -; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: vptestnmq %ymm0, %ymm0, %k1 -; AVX512VLBW-NEXT: vpblendmq (%rdi), %ymm1, %ymm0 {%k1} -; AVX512VLBW-NEXT: retq +; AVX512VL-LABEL: load_v4i64_v4i64: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vptestnmq %ymm0, %ymm0, %k1 +; AVX512VL-NEXT: vpblendmq (%rdi), %ymm1, %ymm0 {%k1} +; AVX512VL-NEXT: retq %mask = icmp eq <4 x i64> %trigger, zeroinitializer %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> %mask, <4 x i64> %dst) ret <4 x i64> %res @@ -1855,6 +1873,15 @@ define <8 x i64> @load_v8i64_v8i16(<8 x i16> %trigger, <8 x i64>* %addr, <8 x i6 ; AVX512F-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: load_v8i64_v8i16: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k1 +; AVX512VLDQ-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1} +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: load_v8i64_v8i16: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vptestnmw %xmm0, %xmm0, %k1 @@ -2138,15 +2165,15 @@ define <2 x i32> @load_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i3 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VLBW-LABEL: load_v2i32_v2i32: -; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512VLBW-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; AVX512VLBW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; AVX512VLBW-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} -; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX512VLBW-NEXT: retq +; AVX512VL-LABEL: load_v2i32_v2i32: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; AVX512VL-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX512VL-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst) ret <2 x i32> %res @@ -2255,11 +2282,11 @@ define <4 x i32> @load_v4i32_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i3 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VLBW-LABEL: load_v4i32_v4i32: -; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; AVX512VLBW-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1} -; AVX512VLBW-NEXT: retq +; AVX512VL-LABEL: load_v4i32_v4i32: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1} +; AVX512VL-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst) ret <4 x i32> %res @@ -2422,6 +2449,14 @@ define <8 x i32> @load_v8i32_v8i1(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %d ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: load_v8i32_v8i1: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512VLDQ-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k1 +; AVX512VLDQ-NEXT: vpblendmd (%rdi), %ymm1, %ymm0 {%k1} +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: load_v8i32_v8i1: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vpsllw $15, %xmm0, %xmm0 @@ -2589,6 +2624,14 @@ define <8 x i32> @load_v8i32_v8i1_zero(<8 x i1> %mask, <8 x i32>* %addr) { ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: load_v8i32_v8i1_zero: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512VLDQ-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k1 +; AVX512VLDQ-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z} +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: load_v8i32_v8i1_zero: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vpsllw $15, %xmm0, %xmm0 @@ -2864,6 +2907,83 @@ define <8 x i16> @load_v8i16_v8i16(<8 x i16> %trigger, <8 x i16>* %addr, <8 x i1 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: load_v8i16_v8i16: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB21_2 +; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.load +; AVX512VLDQ-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm1 +; AVX512VLDQ-NEXT: LBB21_2: ## %else +; AVX512VLDQ-NEXT: kshiftrb $1, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB21_4 +; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.load1 +; AVX512VLDQ-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm1 +; AVX512VLDQ-NEXT: LBB21_4: ## %else2 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0 +; AVX512VLDQ-NEXT: kshiftrb $2, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB21_6 +; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.load4 +; AVX512VLDQ-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm1 +; AVX512VLDQ-NEXT: LBB21_6: ## %else5 +; AVX512VLDQ-NEXT: kshiftrb $3, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB21_8 +; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.load7 +; AVX512VLDQ-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm1 +; AVX512VLDQ-NEXT: LBB21_8: ## %else8 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0 +; AVX512VLDQ-NEXT: kshiftrb $4, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB21_10 +; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.load10 +; AVX512VLDQ-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm1 +; AVX512VLDQ-NEXT: LBB21_10: ## %else11 +; AVX512VLDQ-NEXT: kshiftrb $5, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB21_12 +; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.load13 +; AVX512VLDQ-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm1 +; AVX512VLDQ-NEXT: LBB21_12: ## %else14 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0 +; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k0 +; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB21_14 +; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.load16 +; AVX512VLDQ-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm1 +; AVX512VLDQ-NEXT: LBB21_14: ## %else17 +; AVX512VLDQ-NEXT: kshiftrb $7, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB21_16 +; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.load19 +; AVX512VLDQ-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm1 +; AVX512VLDQ-NEXT: LBB21_16: ## %else20 +; AVX512VLDQ-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: load_v8i16_v8i16: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vpmovw2m %xmm0, %k1 @@ -3574,6 +3694,178 @@ define <16 x i16> @load_v16i16_v16i16(<16 x i16> %trigger, <16 x i16>* %addr, <1 ; AVX512F-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: load_v16i16_v16i16: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB22_2 +; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.load +; AVX512VLDQ-NEXT: vpinsrw $0, (%rdi), %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: LBB22_2: ## %else +; AVX512VLDQ-NEXT: kshiftrw $1, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB22_4 +; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.load1 +; AVX512VLDQ-NEXT: vpinsrw $1, 2(%rdi), %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: LBB22_4: ## %else2 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $2, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB22_6 +; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.load4 +; AVX512VLDQ-NEXT: vpinsrw $2, 4(%rdi), %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: LBB22_6: ## %else5 +; AVX512VLDQ-NEXT: kshiftrw $3, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB22_8 +; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.load7 +; AVX512VLDQ-NEXT: vpinsrw $3, 6(%rdi), %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: LBB22_8: ## %else8 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $4, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB22_10 +; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.load10 +; AVX512VLDQ-NEXT: vpinsrw $4, 8(%rdi), %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: LBB22_10: ## %else11 +; AVX512VLDQ-NEXT: kshiftrw $5, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB22_12 +; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.load13 +; AVX512VLDQ-NEXT: vpinsrw $5, 10(%rdi), %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: LBB22_12: ## %else14 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $6, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB22_14 +; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.load16 +; AVX512VLDQ-NEXT: vpinsrw $6, 12(%rdi), %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: LBB22_14: ## %else17 +; AVX512VLDQ-NEXT: kshiftrw $7, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB22_16 +; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.load19 +; AVX512VLDQ-NEXT: vpinsrw $7, 14(%rdi), %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: LBB22_16: ## %else20 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $8, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB22_18 +; AVX512VLDQ-NEXT: ## %bb.17: ## %cond.load22 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpinsrw $0, 16(%rdi), %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: LBB22_18: ## %else23 +; AVX512VLDQ-NEXT: kshiftrw $9, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB22_20 +; AVX512VLDQ-NEXT: ## %bb.19: ## %cond.load25 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpinsrw $1, 18(%rdi), %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: LBB22_20: ## %else26 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $10, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB22_22 +; AVX512VLDQ-NEXT: ## %bb.21: ## %cond.load28 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpinsrw $2, 20(%rdi), %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: LBB22_22: ## %else29 +; AVX512VLDQ-NEXT: kshiftrw $11, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB22_24 +; AVX512VLDQ-NEXT: ## %bb.23: ## %cond.load31 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpinsrw $3, 22(%rdi), %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: LBB22_24: ## %else32 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $12, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB22_26 +; AVX512VLDQ-NEXT: ## %bb.25: ## %cond.load34 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpinsrw $4, 24(%rdi), %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: LBB22_26: ## %else35 +; AVX512VLDQ-NEXT: kshiftrw $13, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB22_28 +; AVX512VLDQ-NEXT: ## %bb.27: ## %cond.load37 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpinsrw $5, 26(%rdi), %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: LBB22_28: ## %else38 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 +; AVX512VLDQ-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512VLDQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512VLDQ-NEXT: kshiftrw $14, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB22_30 +; AVX512VLDQ-NEXT: ## %bb.29: ## %cond.load40 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512VLDQ-NEXT: vpinsrw $6, 28(%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: LBB22_30: ## %else41 +; AVX512VLDQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB22_32 +; AVX512VLDQ-NEXT: ## %bb.31: ## %cond.load43 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512VLDQ-NEXT: vpinsrw $7, 30(%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: LBB22_32: ## %else44 +; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: load_v16i16_v16i16: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vpmovw2m %ymm0, %k1 @@ -4184,6 +4476,155 @@ define <16 x i8> @load_v16i8_v16i8(<16 x i8> %trigger, <16 x i8>* %addr, <16 x i ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: load_v16i8_v16i8: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB23_2 +; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.load +; AVX512VLDQ-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm1 +; AVX512VLDQ-NEXT: LBB23_2: ## %else +; AVX512VLDQ-NEXT: kshiftrw $1, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB23_4 +; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.load1 +; AVX512VLDQ-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm1 +; AVX512VLDQ-NEXT: LBB23_4: ## %else2 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $2, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB23_6 +; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.load4 +; AVX512VLDQ-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm1 +; AVX512VLDQ-NEXT: LBB23_6: ## %else5 +; AVX512VLDQ-NEXT: kshiftrw $3, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB23_8 +; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.load7 +; AVX512VLDQ-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm1 +; AVX512VLDQ-NEXT: LBB23_8: ## %else8 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $4, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB23_10 +; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.load10 +; AVX512VLDQ-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm1 +; AVX512VLDQ-NEXT: LBB23_10: ## %else11 +; AVX512VLDQ-NEXT: kshiftrw $5, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB23_12 +; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.load13 +; AVX512VLDQ-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm1 +; AVX512VLDQ-NEXT: LBB23_12: ## %else14 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $6, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB23_14 +; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.load16 +; AVX512VLDQ-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm1 +; AVX512VLDQ-NEXT: LBB23_14: ## %else17 +; AVX512VLDQ-NEXT: kshiftrw $7, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB23_16 +; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.load19 +; AVX512VLDQ-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm1 +; AVX512VLDQ-NEXT: LBB23_16: ## %else20 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $8, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB23_18 +; AVX512VLDQ-NEXT: ## %bb.17: ## %cond.load22 +; AVX512VLDQ-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm1 +; AVX512VLDQ-NEXT: LBB23_18: ## %else23 +; AVX512VLDQ-NEXT: kshiftrw $9, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB23_20 +; AVX512VLDQ-NEXT: ## %bb.19: ## %cond.load25 +; AVX512VLDQ-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm1 +; AVX512VLDQ-NEXT: LBB23_20: ## %else26 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $10, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB23_22 +; AVX512VLDQ-NEXT: ## %bb.21: ## %cond.load28 +; AVX512VLDQ-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm1 +; AVX512VLDQ-NEXT: LBB23_22: ## %else29 +; AVX512VLDQ-NEXT: kshiftrw $11, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB23_24 +; AVX512VLDQ-NEXT: ## %bb.23: ## %cond.load31 +; AVX512VLDQ-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm1 +; AVX512VLDQ-NEXT: LBB23_24: ## %else32 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $12, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB23_26 +; AVX512VLDQ-NEXT: ## %bb.25: ## %cond.load34 +; AVX512VLDQ-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm1 +; AVX512VLDQ-NEXT: LBB23_26: ## %else35 +; AVX512VLDQ-NEXT: kshiftrw $13, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB23_28 +; AVX512VLDQ-NEXT: ## %bb.27: ## %cond.load37 +; AVX512VLDQ-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm1 +; AVX512VLDQ-NEXT: LBB23_28: ## %else38 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512VLDQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512VLDQ-NEXT: kshiftrw $14, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB23_30 +; AVX512VLDQ-NEXT: ## %bb.29: ## %cond.load40 +; AVX512VLDQ-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm1 +; AVX512VLDQ-NEXT: LBB23_30: ## %else41 +; AVX512VLDQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB23_32 +; AVX512VLDQ-NEXT: ## %bb.31: ## %cond.load43 +; AVX512VLDQ-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm1 +; AVX512VLDQ-NEXT: LBB23_32: ## %else44 +; AVX512VLDQ-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: load_v16i8_v16i8: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vpmovb2m %xmm0, %k1 @@ -5779,6 +6220,369 @@ define <32 x i8> @load_v32i8_v32i8(<32 x i8> %trigger, <32 x i8>* %addr, <32 x i ; AVX512F-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: load_v32i8_v32i8: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm3 +; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_2 +; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.load +; AVX512VLDQ-NEXT: vpinsrb $0, (%rdi), %xmm1, %xmm3 +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: LBB24_2: ## %else +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $1, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_4 +; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.load1 +; AVX512VLDQ-NEXT: vpinsrb $1, 1(%rdi), %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: LBB24_4: ## %else2 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm3 +; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0 +; AVX512VLDQ-NEXT: kshiftrw $2, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_6 +; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.load4 +; AVX512VLDQ-NEXT: vpinsrb $2, 2(%rdi), %xmm1, %xmm3 +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: LBB24_6: ## %else5 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $3, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_8 +; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.load7 +; AVX512VLDQ-NEXT: vpinsrb $3, 3(%rdi), %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: LBB24_8: ## %else8 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm3 +; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0 +; AVX512VLDQ-NEXT: kshiftrw $4, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_10 +; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.load10 +; AVX512VLDQ-NEXT: vpinsrb $4, 4(%rdi), %xmm1, %xmm3 +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: LBB24_10: ## %else11 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $5, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_12 +; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.load13 +; AVX512VLDQ-NEXT: vpinsrb $5, 5(%rdi), %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: LBB24_12: ## %else14 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm3 +; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0 +; AVX512VLDQ-NEXT: kshiftrw $6, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_14 +; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.load16 +; AVX512VLDQ-NEXT: vpinsrb $6, 6(%rdi), %xmm1, %xmm3 +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: LBB24_14: ## %else17 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $7, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_16 +; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.load19 +; AVX512VLDQ-NEXT: vpinsrb $7, 7(%rdi), %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: LBB24_16: ## %else20 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm3 +; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0 +; AVX512VLDQ-NEXT: kshiftrw $8, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_18 +; AVX512VLDQ-NEXT: ## %bb.17: ## %cond.load22 +; AVX512VLDQ-NEXT: vpinsrb $8, 8(%rdi), %xmm1, %xmm3 +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: LBB24_18: ## %else23 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $9, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_20 +; AVX512VLDQ-NEXT: ## %bb.19: ## %cond.load25 +; AVX512VLDQ-NEXT: vpinsrb $9, 9(%rdi), %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: LBB24_20: ## %else26 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm3 +; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0 +; AVX512VLDQ-NEXT: kshiftrw $10, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_22 +; AVX512VLDQ-NEXT: ## %bb.21: ## %cond.load28 +; AVX512VLDQ-NEXT: vpinsrb $10, 10(%rdi), %xmm1, %xmm3 +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: LBB24_22: ## %else29 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $11, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_24 +; AVX512VLDQ-NEXT: ## %bb.23: ## %cond.load31 +; AVX512VLDQ-NEXT: vpinsrb $11, 11(%rdi), %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: LBB24_24: ## %else32 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm3 +; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0 +; AVX512VLDQ-NEXT: kshiftrw $12, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_26 +; AVX512VLDQ-NEXT: ## %bb.25: ## %cond.load34 +; AVX512VLDQ-NEXT: vpinsrb $12, 12(%rdi), %xmm1, %xmm3 +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: LBB24_26: ## %else35 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $13, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_28 +; AVX512VLDQ-NEXT: ## %bb.27: ## %cond.load37 +; AVX512VLDQ-NEXT: vpinsrb $13, 13(%rdi), %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: LBB24_28: ## %else38 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm3 +; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0 +; AVX512VLDQ-NEXT: kshiftrw $14, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_30 +; AVX512VLDQ-NEXT: ## %bb.29: ## %cond.load40 +; AVX512VLDQ-NEXT: vpinsrb $14, 14(%rdi), %xmm1, %xmm3 +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: LBB24_30: ## %else41 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_32 +; AVX512VLDQ-NEXT: ## %bb.31: ## %cond.load43 +; AVX512VLDQ-NEXT: vpinsrb $15, 15(%rdi), %xmm1, %xmm2 +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLDQ-NEXT: LBB24_32: ## %else44 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm2 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_34 +; AVX512VLDQ-NEXT: ## %bb.33: ## %cond.load46 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpinsrb $0, 16(%rdi), %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: LBB24_34: ## %else47 +; AVX512VLDQ-NEXT: kshiftrw $1, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_36 +; AVX512VLDQ-NEXT: ## %bb.35: ## %cond.load49 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpinsrb $1, 17(%rdi), %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: LBB24_36: ## %else50 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm2 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $2, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_38 +; AVX512VLDQ-NEXT: ## %bb.37: ## %cond.load52 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpinsrb $2, 18(%rdi), %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: LBB24_38: ## %else53 +; AVX512VLDQ-NEXT: kshiftrw $3, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_40 +; AVX512VLDQ-NEXT: ## %bb.39: ## %cond.load55 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpinsrb $3, 19(%rdi), %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: LBB24_40: ## %else56 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm2 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $4, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_42 +; AVX512VLDQ-NEXT: ## %bb.41: ## %cond.load58 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpinsrb $4, 20(%rdi), %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: LBB24_42: ## %else59 +; AVX512VLDQ-NEXT: kshiftrw $5, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_44 +; AVX512VLDQ-NEXT: ## %bb.43: ## %cond.load61 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpinsrb $5, 21(%rdi), %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: LBB24_44: ## %else62 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm2 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $6, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_46 +; AVX512VLDQ-NEXT: ## %bb.45: ## %cond.load64 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpinsrb $6, 22(%rdi), %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: LBB24_46: ## %else65 +; AVX512VLDQ-NEXT: kshiftrw $7, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_48 +; AVX512VLDQ-NEXT: ## %bb.47: ## %cond.load67 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpinsrb $7, 23(%rdi), %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: LBB24_48: ## %else68 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm2 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $8, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_50 +; AVX512VLDQ-NEXT: ## %bb.49: ## %cond.load70 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpinsrb $8, 24(%rdi), %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: LBB24_50: ## %else71 +; AVX512VLDQ-NEXT: kshiftrw $9, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_52 +; AVX512VLDQ-NEXT: ## %bb.51: ## %cond.load73 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpinsrb $9, 25(%rdi), %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: LBB24_52: ## %else74 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm2 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $10, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_54 +; AVX512VLDQ-NEXT: ## %bb.53: ## %cond.load76 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpinsrb $10, 26(%rdi), %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: LBB24_54: ## %else77 +; AVX512VLDQ-NEXT: kshiftrw $11, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_56 +; AVX512VLDQ-NEXT: ## %bb.55: ## %cond.load79 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpinsrb $11, 27(%rdi), %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: LBB24_56: ## %else80 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm2 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $12, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_58 +; AVX512VLDQ-NEXT: ## %bb.57: ## %cond.load82 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpinsrb $12, 28(%rdi), %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: LBB24_58: ## %else83 +; AVX512VLDQ-NEXT: kshiftrw $13, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_60 +; AVX512VLDQ-NEXT: ## %bb.59: ## %cond.load85 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpinsrb $13, 29(%rdi), %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: LBB24_60: ## %else86 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512VLDQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512VLDQ-NEXT: kshiftrw $14, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_62 +; AVX512VLDQ-NEXT: ## %bb.61: ## %cond.load88 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512VLDQ-NEXT: vpinsrb $14, 30(%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: LBB24_62: ## %else89 +; AVX512VLDQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB24_64 +; AVX512VLDQ-NEXT: ## %bb.63: ## %cond.load91 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512VLDQ-NEXT: vpinsrb $15, 31(%rdi), %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512VLDQ-NEXT: LBB24_64: ## %else92 +; AVX512VLDQ-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: load_v32i8_v32i8: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vpmovb2m %ymm0, %k1 @@ -5829,6 +6633,13 @@ define <4 x float> @mload_constmask_v4f32(<4 x float>* %addr, <4 x float> %dst) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: mload_constmask_v4f32: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: movb $13, %al +; AVX512VLDQ-NEXT: kmovw %eax, %k1 +; AVX512VLDQ-NEXT: vmovups (%rdi), %xmm0 {%k1} +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: mload_constmask_v4f32: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: movb $13, %al @@ -5859,11 +6670,11 @@ define <4 x float> @mload_constmask_v4f32_all(<4 x float>* %addr) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VLBW-LABEL: mload_constmask_v4f32_all: -; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: kxnorw %k0, %k0, %k1 -; AVX512VLBW-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} -; AVX512VLBW-NEXT: retq +; AVX512VL-LABEL: mload_constmask_v4f32_all: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: kxnorw %k0, %k0, %k1 +; AVX512VL-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} +; AVX512VL-NEXT: retq %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> , <4 x float>undef) ret <4 x float> %res } @@ -5929,6 +6740,13 @@ define <4 x i32> @mload_constmask_v4i32(<4 x i32>* %addr, <4 x i32> %dst) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: mload_constmask_v4i32: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: movb $14, %al +; AVX512VLDQ-NEXT: kmovw %eax, %k1 +; AVX512VLDQ-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: mload_constmask_v4i32: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: movb $14, %al @@ -5997,6 +6815,13 @@ define <8 x float> @mload_constmask_v8f32(<8 x float>* %addr, <8 x float> %dst) ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: mload_constmask_v8f32: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: movb $7, %al +; AVX512VLDQ-NEXT: kmovw %eax, %k1 +; AVX512VLDQ-NEXT: vmovups (%rdi), %ymm0 {%k1} +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: mload_constmask_v8f32: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: movb $7, %al @@ -6030,6 +6855,13 @@ define <4 x double> @mload_constmask_v4f64(<4 x double>* %addr, <4 x double> %ds ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: mload_constmask_v4f64: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: movb $7, %al +; AVX512VLDQ-NEXT: kmovw %eax, %k1 +; AVX512VLDQ-NEXT: vmovupd (%rdi), %ymm0 {%k1} +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: mload_constmask_v4f64: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: movb $7, %al @@ -6079,6 +6911,13 @@ define <8 x i32> @mload_constmask_v8i32(<8 x i32>* %addr, <8 x i32> %dst) { ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: mload_constmask_v8i32: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: movb $-121, %al +; AVX512VLDQ-NEXT: kmovw %eax, %k1 +; AVX512VLDQ-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: mload_constmask_v8i32: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: movb $-121, %al @@ -6117,6 +6956,13 @@ define <4 x i64> @mload_constmask_v4i64(<4 x i64>* %addr, <4 x i64> %dst) { ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: mload_constmask_v4i64: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: movb $9, %al +; AVX512VLDQ-NEXT: kmovw %eax, %k1 +; AVX512VLDQ-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: mload_constmask_v4i64: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: movb $9, %al @@ -6150,6 +6996,13 @@ define <8 x double> @mload_constmask_v8f64(<8 x double>* %addr, <8 x double> %ds ; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1} ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: mload_constmask_v8f64: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: movb $-121, %al +; AVX512VLDQ-NEXT: kmovw %eax, %k1 +; AVX512VLDQ-NEXT: vmovupd (%rdi), %zmm0 {%k1} +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: mload_constmask_v8f64: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: movb $-121, %al @@ -6183,6 +7036,13 @@ define <4 x double> @mload_constmask_v4f64_undef_passthrough(<4 x double>* %addr ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: mload_constmask_v4f64_undef_passthrough: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: movb $7, %al +; AVX512VLDQ-NEXT: kmovw %eax, %k1 +; AVX512VLDQ-NEXT: vmovupd (%rdi), %ymm0 {%k1} {z} +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: mload_constmask_v4f64_undef_passthrough: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: movb $7, %al @@ -6221,6 +7081,13 @@ define <4 x i64> @mload_constmask_v4i64_undef_passthrough(<4 x i64>* %addr) { ; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: mload_constmask_v4i64_undef_passthrough: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: movb $6, %al +; AVX512VLDQ-NEXT: kmovw %eax, %k1 +; AVX512VLDQ-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z} +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: mload_constmask_v4i64_undef_passthrough: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: movb $6, %al diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll index 71a94c6..efbb1ef 100644 --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -4,7 +4,8 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VLBW +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512dq,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLDQ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLBW ; ; vXf64 @@ -94,6 +95,12 @@ define void @store_v2f64_v2i64(<2 x i64> %trigger, <2 x double>* %addr, <2 x dou ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: store_v2f64_v2i64: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpmovq2m %xmm0, %k1 +; AVX512VLDQ-NEXT: vmovupd %xmm1, (%rdi) {%k1} +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: store_v2f64_v2i64: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -201,6 +208,13 @@ define void @store_v4f64_v4i64(<4 x i64> %trigger, <4 x double>* %addr, <4 x dou ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: store_v4f64_v4i64: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpmovq2m %ymm0, %k1 +; AVX512VLDQ-NEXT: vmovupd %ymm1, (%rdi) {%k1} +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: store_v4f64_v4i64: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -289,21 +303,18 @@ define void @store_v2f32_v2i32(<2 x i32> %trigger, <2 x float>* %addr, <2 x floa ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VLBW-LABEL: store_v2f32_v2i32: -; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512VLBW-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; AVX512VLBW-NEXT: vmovups %xmm1, (%rdi) {%k1} -; AVX512VLBW-NEXT: retq +; AVX512VL-LABEL: store_v2f32_v2i32: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmovups %xmm1, (%rdi) {%k1} +; AVX512VL-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask) ret void } -; PR34584: The mask bit for each data element is the most significant bit of the mask operand, so a compare isn't needed. -; FIXME: The AVX512 code should be improved to use 'vpmovd2m'. Add tests for 512-bit vectors when implementing that. - define void @store_v4f32_v4i32(<4 x float> %x, <4 x float>* %ptr, <4 x float> %y, <4 x i32> %mask) { ; SSE2-LABEL: store_v4f32_v4i32: ; SSE2: ## %bb.0: @@ -391,6 +402,12 @@ define void @store_v4f32_v4i32(<4 x float> %x, <4 x float>* %ptr, <4 x float> %y ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: store_v4f32_v4i32: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpmovd2m %xmm2, %k1 +; AVX512VLDQ-NEXT: vmovups %xmm0, (%rdi) {%k1} +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: store_v4f32_v4i32: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -555,6 +572,13 @@ define void @store_v8f32_v8i32(<8 x float> %x, <8 x float>* %ptr, <8 x float> %y ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: store_v8f32_v8i32: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k1 +; AVX512VLDQ-NEXT: vmovups %ymm0, (%rdi) {%k1} +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: store_v8f32_v8i32: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -835,13 +859,28 @@ define void @store_v16f32_v16i32(<16 x float> %x, <16 x float>* %ptr, <16 x floa ; AVX1OR2-NEXT: vzeroupper ; AVX1OR2-NEXT: retq ; -; AVX512-LABEL: store_v16f32_v16i32: -; AVX512: ## %bb.0: -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpcmpgtd %zmm2, %zmm1, %k1 -; AVX512-NEXT: vmovups %zmm0, (%rdi) {%k1} -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: store_v16f32_v16i32: +; AVX512F: ## %bb.0: +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpcmpgtd %zmm2, %zmm1, %k1 +; AVX512F-NEXT: vmovups %zmm0, (%rdi) {%k1} +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VLDQ-LABEL: store_v16f32_v16i32: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k1 +; AVX512VLDQ-NEXT: vmovups %zmm0, (%rdi) {%k1} +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq +; +; AVX512VLBW-LABEL: store_v16f32_v16i32: +; AVX512VLBW: ## %bb.0: +; AVX512VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpcmpgtd %zmm2, %zmm1, %k1 +; AVX512VLBW-NEXT: vmovups %zmm0, (%rdi) {%k1} +; AVX512VLBW-NEXT: vzeroupper +; AVX512VLBW-NEXT: retq %bool_mask = icmp slt <16 x i32> %mask, zeroinitializer call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> %x, <16 x float>* %ptr, i32 1, <16 x i1> %bool_mask) ret void @@ -918,6 +957,12 @@ define void @store_v2i64_v2i64(<2 x i64> %trigger, <2 x i64>* %addr, <2 x i64> % ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: store_v2i64_v2i64: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpmovq2m %xmm0, %k1 +; AVX512VLDQ-NEXT: vmovdqu64 %xmm1, (%rdi) {%k1} +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: store_v2i64_v2i64: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -1033,6 +1078,13 @@ define void @store_v4i64_v4i64(<4 x i64> %trigger, <4 x i64>* %addr, <4 x i64> % ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: store_v4i64_v4i64: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpmovq2m %ymm0, %k1 +; AVX512VLDQ-NEXT: vmovdqu64 %ymm1, (%rdi) {%k1} +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: store_v4i64_v4i64: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -1146,13 +1198,13 @@ define void @store_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> % ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VLBW-LABEL: store_v2i32_v2i32: -; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512VLBW-NEXT: vptestnmq %xmm0, %xmm0, %k1 -; AVX512VLBW-NEXT: vpmovqd %xmm1, (%rdi) {%k1} -; AVX512VLBW-NEXT: retq +; AVX512VL-LABEL: store_v2i32_v2i32: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vpmovqd %xmm1, (%rdi) {%k1} +; AVX512VL-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask) ret void @@ -1251,11 +1303,11 @@ define void @store_v4i32_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> % ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VLBW-LABEL: store_v4i32_v4i32: -; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k1 -; AVX512VLBW-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1} -; AVX512VLBW-NEXT: retq +; AVX512VL-LABEL: store_v4i32_v4i32: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1} +; AVX512VL-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> %mask) ret void @@ -1422,12 +1474,12 @@ define void @store_v8i32_v8i32(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> % ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VLBW-LABEL: store_v8i32_v8i32: -; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: vptestnmd %ymm0, %ymm0, %k1 -; AVX512VLBW-NEXT: vmovdqu32 %ymm1, (%rdi) {%k1} -; AVX512VLBW-NEXT: vzeroupper -; AVX512VLBW-NEXT: retq +; AVX512VL-LABEL: store_v8i32_v8i32: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 +; AVX512VL-NEXT: vmovdqu32 %ymm1, (%rdi) {%k1} +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq %mask = icmp eq <8 x i32> %trigger, zeroinitializer call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %val, <8 x i32>* %addr, i32 4, <8 x i1> %mask) ret void @@ -1702,6 +1754,82 @@ define void @store_v8i16_v8i16(<8 x i16> %trigger, <8 x i16>* %addr, <8 x i16> % ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: store_v8i16_v8i16: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB13_2 +; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.store +; AVX512VLDQ-NEXT: vpextrw $0, %xmm1, (%rdi) +; AVX512VLDQ-NEXT: LBB13_2: ## %else +; AVX512VLDQ-NEXT: kshiftrb $1, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB13_4 +; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.store1 +; AVX512VLDQ-NEXT: vpextrw $1, %xmm1, 2(%rdi) +; AVX512VLDQ-NEXT: LBB13_4: ## %else2 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0 +; AVX512VLDQ-NEXT: kshiftrb $2, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB13_6 +; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.store3 +; AVX512VLDQ-NEXT: vpextrw $2, %xmm1, 4(%rdi) +; AVX512VLDQ-NEXT: LBB13_6: ## %else4 +; AVX512VLDQ-NEXT: kshiftrb $3, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB13_8 +; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.store5 +; AVX512VLDQ-NEXT: vpextrw $3, %xmm1, 6(%rdi) +; AVX512VLDQ-NEXT: LBB13_8: ## %else6 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2 +; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0 +; AVX512VLDQ-NEXT: kshiftrb $4, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB13_10 +; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.store7 +; AVX512VLDQ-NEXT: vpextrw $4, %xmm1, 8(%rdi) +; AVX512VLDQ-NEXT: LBB13_10: ## %else8 +; AVX512VLDQ-NEXT: kshiftrb $5, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB13_12 +; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.store9 +; AVX512VLDQ-NEXT: vpextrw $5, %xmm1, 10(%rdi) +; AVX512VLDQ-NEXT: LBB13_12: ## %else10 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k0 +; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB13_14 +; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.store11 +; AVX512VLDQ-NEXT: vpextrw $6, %xmm1, 12(%rdi) +; AVX512VLDQ-NEXT: LBB13_14: ## %else12 +; AVX512VLDQ-NEXT: kshiftrb $7, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB13_16 +; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.store13 +; AVX512VLDQ-NEXT: vpextrw $7, %xmm1, 14(%rdi) +; AVX512VLDQ-NEXT: LBB13_16: ## %else14 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: store_v8i16_v8i16: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vptestnmw %xmm0, %xmm0, %k1 @@ -2376,6 +2504,162 @@ define void @store_v16i16_v16i16(<16 x i16> %trigger, <16 x i16>* %addr, <16 x i ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: store_v16i16_v16i16: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB14_2 +; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.store +; AVX512VLDQ-NEXT: vpextrw $0, %xmm1, (%rdi) +; AVX512VLDQ-NEXT: LBB14_2: ## %else +; AVX512VLDQ-NEXT: kshiftrw $1, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB14_4 +; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.store1 +; AVX512VLDQ-NEXT: vpextrw $1, %xmm1, 2(%rdi) +; AVX512VLDQ-NEXT: LBB14_4: ## %else2 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $2, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB14_6 +; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.store3 +; AVX512VLDQ-NEXT: vpextrw $2, %xmm1, 4(%rdi) +; AVX512VLDQ-NEXT: LBB14_6: ## %else4 +; AVX512VLDQ-NEXT: kshiftrw $3, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB14_8 +; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.store5 +; AVX512VLDQ-NEXT: vpextrw $3, %xmm1, 6(%rdi) +; AVX512VLDQ-NEXT: LBB14_8: ## %else6 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $4, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB14_10 +; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.store7 +; AVX512VLDQ-NEXT: vpextrw $4, %xmm1, 8(%rdi) +; AVX512VLDQ-NEXT: LBB14_10: ## %else8 +; AVX512VLDQ-NEXT: kshiftrw $5, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB14_12 +; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.store9 +; AVX512VLDQ-NEXT: vpextrw $5, %xmm1, 10(%rdi) +; AVX512VLDQ-NEXT: LBB14_12: ## %else10 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $6, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB14_14 +; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.store11 +; AVX512VLDQ-NEXT: vpextrw $6, %xmm1, 12(%rdi) +; AVX512VLDQ-NEXT: LBB14_14: ## %else12 +; AVX512VLDQ-NEXT: kshiftrw $7, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB14_16 +; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.store13 +; AVX512VLDQ-NEXT: vpextrw $7, %xmm1, 14(%rdi) +; AVX512VLDQ-NEXT: LBB14_16: ## %else14 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $8, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB14_18 +; AVX512VLDQ-NEXT: ## %bb.17: ## %cond.store15 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpextrw $0, %xmm2, 16(%rdi) +; AVX512VLDQ-NEXT: LBB14_18: ## %else16 +; AVX512VLDQ-NEXT: kshiftrw $9, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB14_20 +; AVX512VLDQ-NEXT: ## %bb.19: ## %cond.store17 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpextrw $1, %xmm2, 18(%rdi) +; AVX512VLDQ-NEXT: LBB14_20: ## %else18 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $10, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB14_22 +; AVX512VLDQ-NEXT: ## %bb.21: ## %cond.store19 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpextrw $2, %xmm2, 20(%rdi) +; AVX512VLDQ-NEXT: LBB14_22: ## %else20 +; AVX512VLDQ-NEXT: kshiftrw $11, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB14_24 +; AVX512VLDQ-NEXT: ## %bb.23: ## %cond.store21 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpextrw $3, %xmm2, 22(%rdi) +; AVX512VLDQ-NEXT: LBB14_24: ## %else22 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $12, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB14_26 +; AVX512VLDQ-NEXT: ## %bb.25: ## %cond.store23 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpextrw $4, %xmm2, 24(%rdi) +; AVX512VLDQ-NEXT: LBB14_26: ## %else24 +; AVX512VLDQ-NEXT: kshiftrw $13, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB14_28 +; AVX512VLDQ-NEXT: ## %bb.27: ## %cond.store25 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpextrw $5, %xmm2, 26(%rdi) +; AVX512VLDQ-NEXT: LBB14_28: ## %else26 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; AVX512VLDQ-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512VLDQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512VLDQ-NEXT: kshiftrw $14, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB14_30 +; AVX512VLDQ-NEXT: ## %bb.29: ## %cond.store27 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512VLDQ-NEXT: vpextrw $6, %xmm0, 28(%rdi) +; AVX512VLDQ-NEXT: LBB14_30: ## %else28 +; AVX512VLDQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB14_32 +; AVX512VLDQ-NEXT: ## %bb.31: ## %cond.store29 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512VLDQ-NEXT: vpextrw $7, %xmm0, 30(%rdi) +; AVX512VLDQ-NEXT: LBB14_32: ## %else30 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: store_v16i16_v16i16: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vptestnmw %ymm0, %ymm0, %k1 @@ -2908,6 +3192,154 @@ define void @store_v16i8_v16i8(<16 x i8> %trigger, <16 x i8>* %addr, <16 x i8> % ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: store_v16i8_v16i8: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB15_2 +; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.store +; AVX512VLDQ-NEXT: vpextrb $0, %xmm1, (%rdi) +; AVX512VLDQ-NEXT: LBB15_2: ## %else +; AVX512VLDQ-NEXT: kshiftrw $1, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB15_4 +; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.store1 +; AVX512VLDQ-NEXT: vpextrb $1, %xmm1, 1(%rdi) +; AVX512VLDQ-NEXT: LBB15_4: ## %else2 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $2, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB15_6 +; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.store3 +; AVX512VLDQ-NEXT: vpextrb $2, %xmm1, 2(%rdi) +; AVX512VLDQ-NEXT: LBB15_6: ## %else4 +; AVX512VLDQ-NEXT: kshiftrw $3, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB15_8 +; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.store5 +; AVX512VLDQ-NEXT: vpextrb $3, %xmm1, 3(%rdi) +; AVX512VLDQ-NEXT: LBB15_8: ## %else6 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $4, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB15_10 +; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.store7 +; AVX512VLDQ-NEXT: vpextrb $4, %xmm1, 4(%rdi) +; AVX512VLDQ-NEXT: LBB15_10: ## %else8 +; AVX512VLDQ-NEXT: kshiftrw $5, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB15_12 +; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.store9 +; AVX512VLDQ-NEXT: vpextrb $5, %xmm1, 5(%rdi) +; AVX512VLDQ-NEXT: LBB15_12: ## %else10 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $6, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB15_14 +; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.store11 +; AVX512VLDQ-NEXT: vpextrb $6, %xmm1, 6(%rdi) +; AVX512VLDQ-NEXT: LBB15_14: ## %else12 +; AVX512VLDQ-NEXT: kshiftrw $7, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB15_16 +; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.store13 +; AVX512VLDQ-NEXT: vpextrb $7, %xmm1, 7(%rdi) +; AVX512VLDQ-NEXT: LBB15_16: ## %else14 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $8, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB15_18 +; AVX512VLDQ-NEXT: ## %bb.17: ## %cond.store15 +; AVX512VLDQ-NEXT: vpextrb $8, %xmm1, 8(%rdi) +; AVX512VLDQ-NEXT: LBB15_18: ## %else16 +; AVX512VLDQ-NEXT: kshiftrw $9, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB15_20 +; AVX512VLDQ-NEXT: ## %bb.19: ## %cond.store17 +; AVX512VLDQ-NEXT: vpextrb $9, %xmm1, 9(%rdi) +; AVX512VLDQ-NEXT: LBB15_20: ## %else18 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $10, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB15_22 +; AVX512VLDQ-NEXT: ## %bb.21: ## %cond.store19 +; AVX512VLDQ-NEXT: vpextrb $10, %xmm1, 10(%rdi) +; AVX512VLDQ-NEXT: LBB15_22: ## %else20 +; AVX512VLDQ-NEXT: kshiftrw $11, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB15_24 +; AVX512VLDQ-NEXT: ## %bb.23: ## %cond.store21 +; AVX512VLDQ-NEXT: vpextrb $11, %xmm1, 11(%rdi) +; AVX512VLDQ-NEXT: LBB15_24: ## %else22 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $12, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB15_26 +; AVX512VLDQ-NEXT: ## %bb.25: ## %cond.store23 +; AVX512VLDQ-NEXT: vpextrb $12, %xmm1, 12(%rdi) +; AVX512VLDQ-NEXT: LBB15_26: ## %else24 +; AVX512VLDQ-NEXT: kshiftrw $13, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB15_28 +; AVX512VLDQ-NEXT: ## %bb.27: ## %cond.store25 +; AVX512VLDQ-NEXT: vpextrb $13, %xmm1, 13(%rdi) +; AVX512VLDQ-NEXT: LBB15_28: ## %else26 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512VLDQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512VLDQ-NEXT: kshiftrw $14, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB15_30 +; AVX512VLDQ-NEXT: ## %bb.29: ## %cond.store27 +; AVX512VLDQ-NEXT: vpextrb $14, %xmm1, 14(%rdi) +; AVX512VLDQ-NEXT: LBB15_30: ## %else28 +; AVX512VLDQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB15_32 +; AVX512VLDQ-NEXT: ## %bb.31: ## %cond.store29 +; AVX512VLDQ-NEXT: vpextrb $15, %xmm1, 15(%rdi) +; AVX512VLDQ-NEXT: LBB15_32: ## %else30 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: store_v16i8_v16i8: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vptestnmb %xmm0, %xmm0, %k1 @@ -4253,6 +4685,337 @@ define void @store_v32i8_v32i8(<32 x i8> %trigger, <32 x i8>* %addr, <32 x i8> % ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: store_v32i8_v32i8: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm3 +; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_2 +; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.store +; AVX512VLDQ-NEXT: vpextrb $0, %xmm1, (%rdi) +; AVX512VLDQ-NEXT: LBB16_2: ## %else +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $1, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_4 +; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.store1 +; AVX512VLDQ-NEXT: vpextrb $1, %xmm1, 1(%rdi) +; AVX512VLDQ-NEXT: LBB16_4: ## %else2 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm3 +; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0 +; AVX512VLDQ-NEXT: kshiftrw $2, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_6 +; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.store3 +; AVX512VLDQ-NEXT: vpextrb $2, %xmm1, 2(%rdi) +; AVX512VLDQ-NEXT: LBB16_6: ## %else4 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $3, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_8 +; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.store5 +; AVX512VLDQ-NEXT: vpextrb $3, %xmm1, 3(%rdi) +; AVX512VLDQ-NEXT: LBB16_8: ## %else6 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm3 +; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0 +; AVX512VLDQ-NEXT: kshiftrw $4, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_10 +; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.store7 +; AVX512VLDQ-NEXT: vpextrb $4, %xmm1, 4(%rdi) +; AVX512VLDQ-NEXT: LBB16_10: ## %else8 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $5, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_12 +; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.store9 +; AVX512VLDQ-NEXT: vpextrb $5, %xmm1, 5(%rdi) +; AVX512VLDQ-NEXT: LBB16_12: ## %else10 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm3 +; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0 +; AVX512VLDQ-NEXT: kshiftrw $6, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_14 +; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.store11 +; AVX512VLDQ-NEXT: vpextrb $6, %xmm1, 6(%rdi) +; AVX512VLDQ-NEXT: LBB16_14: ## %else12 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $7, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_16 +; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.store13 +; AVX512VLDQ-NEXT: vpextrb $7, %xmm1, 7(%rdi) +; AVX512VLDQ-NEXT: LBB16_16: ## %else14 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm3 +; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0 +; AVX512VLDQ-NEXT: kshiftrw $8, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_18 +; AVX512VLDQ-NEXT: ## %bb.17: ## %cond.store15 +; AVX512VLDQ-NEXT: vpextrb $8, %xmm1, 8(%rdi) +; AVX512VLDQ-NEXT: LBB16_18: ## %else16 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $9, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_20 +; AVX512VLDQ-NEXT: ## %bb.19: ## %cond.store17 +; AVX512VLDQ-NEXT: vpextrb $9, %xmm1, 9(%rdi) +; AVX512VLDQ-NEXT: LBB16_20: ## %else18 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm3 +; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0 +; AVX512VLDQ-NEXT: kshiftrw $10, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_22 +; AVX512VLDQ-NEXT: ## %bb.21: ## %cond.store19 +; AVX512VLDQ-NEXT: vpextrb $10, %xmm1, 10(%rdi) +; AVX512VLDQ-NEXT: LBB16_22: ## %else20 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $11, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_24 +; AVX512VLDQ-NEXT: ## %bb.23: ## %cond.store21 +; AVX512VLDQ-NEXT: vpextrb $11, %xmm1, 11(%rdi) +; AVX512VLDQ-NEXT: LBB16_24: ## %else22 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm3 +; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0 +; AVX512VLDQ-NEXT: kshiftrw $12, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_26 +; AVX512VLDQ-NEXT: ## %bb.25: ## %cond.store23 +; AVX512VLDQ-NEXT: vpextrb $12, %xmm1, 12(%rdi) +; AVX512VLDQ-NEXT: LBB16_26: ## %else24 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $13, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_28 +; AVX512VLDQ-NEXT: ## %bb.27: ## %cond.store25 +; AVX512VLDQ-NEXT: vpextrb $13, %xmm1, 13(%rdi) +; AVX512VLDQ-NEXT: LBB16_28: ## %else26 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm3 +; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0 +; AVX512VLDQ-NEXT: kshiftrw $14, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_30 +; AVX512VLDQ-NEXT: ## %bb.29: ## %cond.store27 +; AVX512VLDQ-NEXT: vpextrb $14, %xmm1, 14(%rdi) +; AVX512VLDQ-NEXT: LBB16_30: ## %else28 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_32 +; AVX512VLDQ-NEXT: ## %bb.31: ## %cond.store29 +; AVX512VLDQ-NEXT: vpextrb $15, %xmm1, 15(%rdi) +; AVX512VLDQ-NEXT: LBB16_32: ## %else30 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_34 +; AVX512VLDQ-NEXT: ## %bb.33: ## %cond.store31 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpextrb $0, %xmm2, 16(%rdi) +; AVX512VLDQ-NEXT: LBB16_34: ## %else32 +; AVX512VLDQ-NEXT: kshiftrw $1, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_36 +; AVX512VLDQ-NEXT: ## %bb.35: ## %cond.store33 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpextrb $1, %xmm2, 17(%rdi) +; AVX512VLDQ-NEXT: LBB16_36: ## %else34 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $2, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_38 +; AVX512VLDQ-NEXT: ## %bb.37: ## %cond.store35 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpextrb $2, %xmm2, 18(%rdi) +; AVX512VLDQ-NEXT: LBB16_38: ## %else36 +; AVX512VLDQ-NEXT: kshiftrw $3, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_40 +; AVX512VLDQ-NEXT: ## %bb.39: ## %cond.store37 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpextrb $3, %xmm2, 19(%rdi) +; AVX512VLDQ-NEXT: LBB16_40: ## %else38 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $4, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_42 +; AVX512VLDQ-NEXT: ## %bb.41: ## %cond.store39 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpextrb $4, %xmm2, 20(%rdi) +; AVX512VLDQ-NEXT: LBB16_42: ## %else40 +; AVX512VLDQ-NEXT: kshiftrw $5, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_44 +; AVX512VLDQ-NEXT: ## %bb.43: ## %cond.store41 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpextrb $5, %xmm2, 21(%rdi) +; AVX512VLDQ-NEXT: LBB16_44: ## %else42 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $6, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_46 +; AVX512VLDQ-NEXT: ## %bb.45: ## %cond.store43 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpextrb $6, %xmm2, 22(%rdi) +; AVX512VLDQ-NEXT: LBB16_46: ## %else44 +; AVX512VLDQ-NEXT: kshiftrw $7, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_48 +; AVX512VLDQ-NEXT: ## %bb.47: ## %cond.store45 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpextrb $7, %xmm2, 23(%rdi) +; AVX512VLDQ-NEXT: LBB16_48: ## %else46 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $8, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_50 +; AVX512VLDQ-NEXT: ## %bb.49: ## %cond.store47 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpextrb $8, %xmm2, 24(%rdi) +; AVX512VLDQ-NEXT: LBB16_50: ## %else48 +; AVX512VLDQ-NEXT: kshiftrw $9, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_52 +; AVX512VLDQ-NEXT: ## %bb.51: ## %cond.store49 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpextrb $9, %xmm2, 25(%rdi) +; AVX512VLDQ-NEXT: LBB16_52: ## %else50 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $10, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_54 +; AVX512VLDQ-NEXT: ## %bb.53: ## %cond.store51 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpextrb $10, %xmm2, 26(%rdi) +; AVX512VLDQ-NEXT: LBB16_54: ## %else52 +; AVX512VLDQ-NEXT: kshiftrw $11, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_56 +; AVX512VLDQ-NEXT: ## %bb.55: ## %cond.store53 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpextrb $11, %xmm2, 27(%rdi) +; AVX512VLDQ-NEXT: LBB16_56: ## %else54 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2 +; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0 +; AVX512VLDQ-NEXT: kshiftrw $12, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_58 +; AVX512VLDQ-NEXT: ## %bb.57: ## %cond.store55 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpextrb $12, %xmm2, 28(%rdi) +; AVX512VLDQ-NEXT: LBB16_58: ## %else56 +; AVX512VLDQ-NEXT: kshiftrw $13, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_60 +; AVX512VLDQ-NEXT: ## %bb.59: ## %cond.store57 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512VLDQ-NEXT: vpextrb $13, %xmm2, 29(%rdi) +; AVX512VLDQ-NEXT: LBB16_60: ## %else58 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VLDQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512VLDQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512VLDQ-NEXT: kshiftrw $14, %k0, %k1 +; AVX512VLDQ-NEXT: kmovw %k1, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_62 +; AVX512VLDQ-NEXT: ## %bb.61: ## %cond.store59 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512VLDQ-NEXT: vpextrb $14, %xmm0, 30(%rdi) +; AVX512VLDQ-NEXT: LBB16_62: ## %else60 +; AVX512VLDQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %k0, %eax +; AVX512VLDQ-NEXT: testb $1, %al +; AVX512VLDQ-NEXT: je LBB16_64 +; AVX512VLDQ-NEXT: ## %bb.63: ## %cond.store61 +; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX512VLDQ-NEXT: vpextrb $15, %xmm0, 31(%rdi) +; AVX512VLDQ-NEXT: LBB16_64: ## %else62 +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: store_v32i8_v32i8: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vptestnmb %ymm0, %ymm0, %k1 @@ -4293,11 +5056,11 @@ define void @mstore_constmask_v4i32_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VLBW-LABEL: mstore_constmask_v4i32_v4i32: -; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: kxnorw %k0, %k0, %k1 -; AVX512VLBW-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1} -; AVX512VLBW-NEXT: retq +; AVX512VL-LABEL: mstore_constmask_v4i32_v4i32: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: kxnorw %k0, %k0, %k1 +; AVX512VL-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1} +; AVX512VL-NEXT: retq %mask = icmp eq <4 x i32> %trigger, zeroinitializer call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1>) ret void @@ -4491,6 +5254,14 @@ define void @masked_store_bool_mask_demand_trunc_sext(<4 x double> %x, <4 x doub ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: masked_store_bool_mask_demand_trunc_sext: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512VLDQ-NEXT: vpmovd2m %xmm1, %k1 +; AVX512VLDQ-NEXT: vmovupd %ymm0, (%rdi) {%k1} +; AVX512VLDQ-NEXT: vzeroupper +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: masked_store_bool_mask_demand_trunc_sext: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vpslld $31, %xmm1, %xmm1 @@ -4606,11 +5377,11 @@ define void @one_mask_bit_set1_variable(<4 x float>* %addr, <4 x float> %val, <4 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; -; AVX512VLBW-LABEL: one_mask_bit_set1_variable: -; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: vptestmd {{.*}}(%rip){1to4}, %xmm1, %k1 -; AVX512VLBW-NEXT: vmovups %xmm0, (%rdi) {%k1} -; AVX512VLBW-NEXT: retq +; AVX512VL-LABEL: one_mask_bit_set1_variable: +; AVX512VL: ## %bb.0: +; AVX512VL-NEXT: vptestmd {{.*}}(%rip){1to4}, %xmm1, %k1 +; AVX512VL-NEXT: vmovups %xmm0, (%rdi) {%k1} +; AVX512VL-NEXT: retq %mask_signbit = and <4 x i32> %mask, %mask_bool = icmp ne <4 x i32> %mask_signbit, zeroinitializer call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %val, <4 x float>* %addr, i32 1, <4 x i1> %mask_bool) @@ -4708,6 +5479,17 @@ define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VLDQ-LABEL: widen_masked_store: +; AVX512VLDQ: ## %bb.0: +; AVX512VLDQ-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512VLDQ-NEXT: vpmovd2m %xmm1, %k0 +; AVX512VLDQ-NEXT: vpmovm2d %k0, %xmm1 +; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLDQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3] +; AVX512VLDQ-NEXT: vpmovd2m %xmm1, %k1 +; AVX512VLDQ-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1} +; AVX512VLDQ-NEXT: retq +; ; AVX512VLBW-LABEL: widen_masked_store: ; AVX512VLBW: ## %bb.0: ; AVX512VLBW-NEXT: vpslld $31, %xmm1, %xmm1 -- 2.7.4