From ccc85ac85569c55b6915180d8b722812b4a226b6 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 11 Oct 2019 04:16:49 +0000 Subject: [PATCH] [X86] Add a DAG combine to turn v16i16->v16i8 VTRUNCUS+store into a saturating truncating store. llvm-svn: 374509 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 13 +++++++++++++ llvm/test/CodeGen/X86/min-legal-vector-width.ll | 3 +-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 275e876..0e11941 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -40448,6 +40448,19 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, MVT::v16i8, St->getMemOperand()); } + // Try to fold a vpmovuswb 256->128 into a truncating store. + // FIXME: Generalize this to other types. + // FIXME: Do the same for signed saturation. + if (!St->isTruncatingStore() && VT == MVT::v16i8 && + St->getValue().getOpcode() == X86ISD::VTRUNCUS && + St->getValue().getOperand(0).getValueType() == MVT::v16i16 && + TLI.isTruncStoreLegal(MVT::v16i16, MVT::v16i8) && + St->getValue().hasOneUse()) { + return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(), + dl, St->getValue().getOperand(0), St->getBasePtr(), + MVT::v16i8, St->getMemOperand(), DAG); + } + // Optimize trunc store (of multiple scalars) to shuffle and store. // First, pack all of the elements in one place. Next, store to memory // in fewer chunks. diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll index 95e6d3e..eb90a2a 100644 --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -1104,8 +1104,7 @@ define void @trunc_packus_v16i32_v16i8_store(<16 x i32>* %p, <16 x i8>* %q) "min ; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ; CHECK-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; CHECK-NEXT: vpmovuswb %ymm0, %xmm0 -; CHECK-NEXT: vmovdqa %xmm0, (%rsi) +; CHECK-NEXT: vpmovuswb %ymm0, (%rsi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %a = load <16 x i32>, <16 x i32>* %p -- 2.7.4