From 4cf28bad5b106ba71c03798ba01223ae5942bc05 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 25 Jan 2019 18:37:36 +0000 Subject: [PATCH] [X86] Combine masked store and truncate into masked truncating stores. We also need to combine to masked truncating with saturation stores, but I'm leaving that for a future patch. This does regress some tests that used truncate with saturation followed by a masked store. Those now use a truncating store and use min/max to saturate. Differential Revision: https://reviews.llvm.org/D57218 llvm-svn: 352230 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 18 ++-- llvm/test/CodeGen/X86/masked_store_trunc.ll | 77 ++++++----------- llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll | 105 +++++++++++------------ llvm/test/CodeGen/X86/masked_store_trunc_usat.ll | 91 ++++++++------------ 4 files changed, 127 insertions(+), 164 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 7f53919..d1870b3 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -37701,6 +37701,10 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT VT = Mst->getValue().getValueType(); + EVT StVT = Mst->getMemoryVT(); + SDLoc dl(Mst); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!Mst->isTruncatingStore()) { if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG)) return ScalarStore; @@ -37709,7 +37713,6 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, // simplify ops leading up to it. We only demand the MSB of each lane. 
SDValue Mask = Mst->getMask(); if (Mask.getScalarValueSizeInBits() != 1) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits())); if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) return SDValue(N, 0); @@ -37719,20 +37722,25 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, // pattern above, but that pattern will be different. It will either need to // match setcc more generally or match PCMPGTM later (in tablegen?). + SDValue Value = Mst->getValue(); + if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() && + TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), + Mst->getMemoryVT())) { + return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0), + Mst->getBasePtr(), Mask, + Mst->getMemoryVT(), Mst->getMemOperand(), true); + } + return SDValue(); } // Resolve truncating stores. unsigned NumElems = VT.getVectorNumElements(); - EVT StVT = Mst->getMemoryVT(); - SDLoc dl(Mst); assert(StVT != VT && "Cannot truncate to the same type"); unsigned FromSz = VT.getScalarSizeInBits(); unsigned ToSz = StVT.getScalarSizeInBits(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - // The truncating store is legal in some cases. For example // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw // are designated for truncate store. 
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll index e2237bd..bbbd941 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -8,30 +8,23 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask ; AVX512F-LABEL: truncstore_v8i64_v8i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftlw $8, %k0, %k0 -; AVX512F-NEXT: kshiftrw $8, %k0, %k1 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vpmovqd %zmm0, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: truncstore_v8i64_v8i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512VL-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512VL-NEXT: vmovdqu32 %ymm0, (%rdi) {%k1} +; AVX512VL-NEXT: vpmovqd %zmm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: truncstore_v8i64_v8i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: kshiftlw $8, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k1 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512BW-NEXT: vpmovqd %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -114,19 +107,15 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, <8 x i16>* %p, <8 x i32> %mask ; AVX512BW-LABEL: truncstore_v8i64_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: kshiftld $24, %k0, %k0 -; AVX512BW-NEXT: kshiftrd $24, %k0, %k1 -; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 
-; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512BW-NEXT: vpmovqw %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v8i64_v8i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovqw %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -209,20 +198,15 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, <8 x i8>* %p, <8 x i32> %mask) ; AVX512BW-LABEL: truncstore_v8i64_v8i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: kshiftlq $56, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $56, %k0, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512BW-NEXT: vpmovqb %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v8i64_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -247,8 +231,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, <4 x i32>* %p, <4 x i32> %mask ; AVX512VL-LABEL: truncstore_v4i64_v4i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0 -; AVX512VL-NEXT: vmovdqu32 %xmm0, (%rdi) {%k1} +; AVX512VL-NEXT: vpmovqd %ymm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -325,8 +308,7 @@ define void 
@truncstore_v4i64_v4i16(<4 x i64> %x, <4 x i16>* %p, <4 x i32> %mask ; AVX512BWVL-LABEL: truncstore_v4i64_v4i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer @@ -391,8 +373,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, <4 x i8>* %p, <4 x i32> %mask) ; AVX512BWVL-LABEL: truncstore_v4i64_v4i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer @@ -654,16 +635,14 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, <16 x i16>* %p, <16 x i32> ; AVX512BW-LABEL: truncstore_v16i32_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vpmovdw %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v16i32_v16i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BWVL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BWVL-NEXT: vmovdqu16 %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovdw %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i32> %mask, zeroinitializer @@ -794,16 +773,14 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, <16 x i8>* %p, <16 x i32> %m ; AVX512BW-LABEL: truncstore_v16i32_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vpmovdb %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; 
AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v16i32_v16i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BWVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovdb %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i32> %mask, zeroinitializer @@ -899,8 +876,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, <8 x i16>* %p, <8 x i32> %mask ; AVX512BWVL-LABEL: truncstore_v8i32_v8i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -997,8 +973,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, <8 x i8>* %p, <8 x i32> %mask) ; AVX512BWVL-LABEL: truncstore_v8i32_v8i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -1531,18 +1506,15 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, <32 x i8>* %p, <32 x i8> %ma ; AVX512BW-LABEL: truncstore_v32i16_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmb %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: kmovd %k0, %k1 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmb %zmm1, %zmm1, %k1 +; AVX512BW-NEXT: vpmovwb %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v32i16_v32i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmb %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BWVL-NEXT: vmovdqu8 
%ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <32 x i8> %mask, zeroinitializer @@ -1756,8 +1728,7 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, <16 x i8>* %p, <16 x i8> %ma ; AVX512BWVL-LABEL: truncstore_v16i16_v16i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmb %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i8> %mask, zeroinitializer diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll index 68a2d09..2d25632 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -8,30 +8,29 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask ; AVX512F-LABEL: truncstore_v8i64_v8i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftlw $8, %k0, %k0 -; AVX512F-NEXT: kshiftrw $8, %k0, %k1 -; AVX512F-NEXT: vpmovsqd %zmm0, %ymm0 -; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: truncstore_v8i64_v8i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512VL-NEXT: vpmovsqd %zmm0, %ymm0 -; AVX512VL-NEXT: vmovdqu32 %ymm0, (%rdi) {%k1} +; AVX512VL-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512VL-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512VL-NEXT: vpmovqd %zmm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: 
truncstore_v8i64_v8i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: kshiftlw $8, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k1 -; AVX512BW-NEXT: vpmovsqd %zmm0, %ymm0 -; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512BW-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -118,19 +117,19 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, <8 x i16>* %p, <8 x i32> %mask ; AVX512BW-LABEL: truncstore_v8i64_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: kshiftld $24, %k0, %k0 -; AVX512BW-NEXT: kshiftrd $24, %k0, %k1 -; AVX512BW-NEXT: vpmovsqw %zmm0, %xmm0 -; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512BW-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqw %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v8i64_v8i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpmovsqw %zmm0, %xmm0 -; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovqw %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -219,14 +218,10 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, <8 x i8>* %p, <8 x i32> %mask) ; AVX512BW-LABEL: truncstore_v8i64_v8i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 
killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512BW-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: kshiftlq $56, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $56, %k0, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vpmovqb %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -235,8 +230,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, <8 x i8>* %p, <8 x i32> %mask) ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 ; AVX512BWVL-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -269,8 +263,9 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, <4 x i32>* %p, <4 x i32> %mask ; AVX512VL-LABEL: truncstore_v4i64_v4i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512VL-NEXT: vpmovsqd %ymm0, %xmm0 -; AVX512VL-NEXT: vmovdqu32 %xmm0, (%rdi) {%k1} +; AVX512VL-NEXT: vpminsq {{.*}}(%rip){1to4}, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmaxsq {{.*}}(%rip){1to4}, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovqd %ymm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -365,8 +360,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, <4 x i16>* %p, <4 x i32> %mask ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 ; AVX512BWVL-NEXT: vpminsq {{.*}}(%rip){1to4}, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmaxsq {{.*}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; 
AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer @@ -445,8 +439,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, <4 x i8>* %p, <4 x i32> %mask) ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 ; AVX512BWVL-NEXT: vpminsq {{.*}}(%rip){1to4}, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmaxsq {{.*}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer @@ -760,16 +753,18 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, <16 x i16>* %p, <16 x i32> ; AVX512BW-LABEL: truncstore_v16i32_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpmovsdw %zmm0, %ymm0 -; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vpminsd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmaxsd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v16i32_v16i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BWVL-NEXT: vpmovsdw %zmm0, %ymm0 -; AVX512BWVL-NEXT: vmovdqu16 %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmaxsd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovdw %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i32> %mask, zeroinitializer @@ -904,16 +899,18 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, <16 x i8>* %p, <16 x i32> %m ; AVX512BW-LABEL: truncstore_v16i32_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpmovsdb %zmm0, %xmm0 -; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vpminsd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmaxsd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; 
AVX512BW-NEXT: vpmovdb %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v16i32_v16i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BWVL-NEXT: vpmovsdb %zmm0, %xmm0 -; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmaxsd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovdb %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i32> %mask, zeroinitializer @@ -1019,8 +1016,9 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, <8 x i16>* %p, <8 x i32> %mask ; AVX512BWVL-LABEL: truncstore_v8i32_v8i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpmovsdw %ymm0, %xmm0 -; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmaxsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -1129,8 +1127,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, <8 x i8>* %p, <8 x i32> %mask) ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 ; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpmaxsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -1701,18 +1698,19 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, <32 x i8>* %p, <32 x i8> %ma ; AVX512BW-LABEL: truncstore_v32i16_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmb %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: kmovd %k0, %k1 -; AVX512BW-NEXT: vpmovswb %zmm0, 
%ymm0 -; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmb %zmm1, %zmm1, %k1 +; AVX512BW-NEXT: vpminsw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmaxsw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v32i16_v32i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmb %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpmovswb %zmm0, %ymm0 -; AVX512BWVL-NEXT: vmovdqu8 %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpminsw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmaxsw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <32 x i8> %mask, zeroinitializer @@ -1933,8 +1931,9 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, <16 x i8>* %p, <16 x i8> %ma ; AVX512BWVL-LABEL: truncstore_v16i16_v16i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmb %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpmovswb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmaxsw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i8> %mask, zeroinitializer diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll index 6ea5749..8f07023 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -8,30 +8,26 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask ; AVX512F-LABEL: truncstore_v8i64_v8i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftlw $8, %k0, %k0 -; AVX512F-NEXT: kshiftrw $8, %k0, %k1 -; AVX512F-NEXT: vpmovusqd %zmm0, %ymm0 -; AVX512F-NEXT: vmovdqu32 %zmm0, 
(%rdi) {%k1} +; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovqd %zmm0, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: truncstore_v8i64_v8i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512VL-NEXT: vpmovusqd %zmm0, %ymm0 -; AVX512VL-NEXT: vmovdqu32 %ymm0, (%rdi) {%k1} +; AVX512VL-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512VL-NEXT: vpmovqd %zmm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: truncstore_v8i64_v8i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: kshiftlw $8, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $8, %k0, %k1 -; AVX512BW-NEXT: vpmovusqd %zmm0, %ymm0 -; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512BW-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqd %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -116,19 +112,17 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, <8 x i16>* %p, <8 x i32> %mask ; AVX512BW-LABEL: truncstore_v8i64_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: kshiftld $24, %k0, %k0 -; AVX512BW-NEXT: kshiftrd $24, %k0, %k1 -; AVX512BW-NEXT: vpmovusqw %zmm0, %xmm0 -; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512BW-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovqw %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v8i64_v8i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpmovusqw %zmm0, %xmm0 -; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} +; 
AVX512BWVL-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovqw %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -214,13 +208,9 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, <8 x i8>* %p, <8 x i32> %mask) ; AVX512BW-LABEL: truncstore_v8i64_v8i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512BW-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: kshiftlq $56, %k0, %k0 -; AVX512BW-NEXT: kshiftrq $56, %k0, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vpmovqb %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -228,8 +218,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, <8 x i8>* %p, <8 x i32> %mask) ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 ; AVX512BWVL-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -258,8 +247,8 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, <4 x i32>* %p, <4 x i32> %mask ; AVX512VL-LABEL: truncstore_v4i64_v4i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1 -; AVX512VL-NEXT: vpmovusqd %ymm0, %xmm0 -; AVX512VL-NEXT: vmovdqu32 %xmm0, (%rdi) {%k1} +; AVX512VL-NEXT: vpminuq {{.*}}(%rip){1to4}, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovqd %ymm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -345,8 +334,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, <4 x i16>* %p, <4 x i32> %mask ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 ; AVX512BWVL-NEXT: 
vpminuq {{.*}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer @@ -418,8 +406,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, <4 x i8>* %p, <4 x i32> %mask) ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1 ; AVX512BWVL-NEXT: vpminuq {{.*}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer @@ -710,16 +697,16 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, <16 x i16>* %p, <16 x i32> ; AVX512BW-LABEL: truncstore_v16i32_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpmovusdw %zmm0, %ymm0 -; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vpminud {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v16i32_v16i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BWVL-NEXT: vpmovusdw %zmm0, %ymm0 -; AVX512BWVL-NEXT: vmovdqu16 %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpminud {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovdw %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i32> %mask, zeroinitializer @@ -852,16 +839,16 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, <16 x i8>* %p, <16 x i32> %m ; AVX512BW-LABEL: truncstore_v16i32_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpmovusdb %zmm0, %xmm0 -; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vpminud {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512BW-NEXT: 
vpmovdb %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v16i32_v16i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BWVL-NEXT: vpmovusdb %zmm0, %xmm0 -; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpminud {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovdb %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i32> %mask, zeroinitializer @@ -961,8 +948,8 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, <8 x i16>* %p, <8 x i32> %mask ; AVX512BWVL-LABEL: truncstore_v8i32_v8i16: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpmovusdw %ymm0, %xmm0 -; AVX512BWVL-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -1064,8 +1051,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, <8 x i8>* %p, <8 x i32> %mask) ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1 ; AVX512BWVL-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer @@ -1617,18 +1603,17 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, <32 x i8>* %p, <32 x i8> %ma ; AVX512BW-LABEL: truncstore_v32i16_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmb %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: kmovd %k0, %k1 -; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm0 -; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} +; AVX512BW-NEXT: vptestmb %zmm1, %zmm1, %k1 +; AVX512BW-NEXT: vpminuw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb 
%zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: truncstore_v32i16_v32i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmb %ymm1, %ymm1, %k1 -; AVX512BWVL-NEXT: vpmovuswb %zmm0, %ymm0 -; AVX512BWVL-NEXT: vmovdqu8 %ymm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpminuw {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <32 x i8> %mask, zeroinitializer @@ -1845,8 +1830,8 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, <16 x i8>* %p, <16 x i8> %ma ; AVX512BWVL-LABEL: truncstore_v16i16_v16i8: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vptestmb %xmm1, %xmm1, %k1 -; AVX512BWVL-NEXT: vpmovuswb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} +; AVX512BWVL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rdi) {%k1} ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %a = icmp ne <16 x i8> %mask, zeroinitializer -- 2.7.4