From 7f7ef0208b57a4d253cd8b07053460f40ad7cbc8 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Wed, 7 Aug 2019 21:16:10 +0000
Subject: [PATCH] [X86] Allow pack instructions to be used for 512->256
 truncates when -mprefer-vector-width=256 is causing 512-bit vectors to be
 split

If we're splitting the 512-bit vector anyway and we have zero/sign
bits, then we might as well use pack instructions to concat and
truncate at once.

Differential Revision: https://reviews.llvm.org/D65904

llvm-svn: 368210
---
 llvm/lib/Target/X86/X86ISelLowering.cpp         | 11 +++++++--
 llvm/test/CodeGen/X86/min-legal-vector-width.ll | 33 ++++++++++---------------
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e18b55b..368f409 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -40898,8 +40898,8 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
 static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
                                                SelectionDAG &DAG,
                                                const X86Subtarget &Subtarget) {
-  // Requires SSE2 but AVX512 has fast truncate.
-  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
+  // Requires SSE2.
+  if (!Subtarget.hasSSE2())
     return SDValue();
 
   if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
@@ -40923,6 +40923,13 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
   if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
     return SDValue();
 
+  // AVX512 has fast truncate, but if the input is already going to be split,
+  // there's no harm in trying pack.
+  if (Subtarget.hasAVX512() &&
+      !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
+        InVT.is512BitVector()))
+    return SDValue();
+
   unsigned NumPackedSignBits = std::min(SVT.getSizeInBits(), 16);
   unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
 
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index b75fd2d8..40d557a 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -757,9 +757,8 @@ define <8 x i32> @trunc_v8i64_v8i32_zeroes(<8 x i64>* %x) nounwind "min-legal-ve
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpsrlq $48, 32(%rdi), %ymm0
 ; CHECK-NEXT:    vpsrlq $48, (%rdi), %ymm1
-; CHECK-NEXT:    vpmovqd %ymm1, %xmm1
-; CHECK-NEXT:    vpmovqd %ymm0, %xmm0
-; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    vpackusdw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; CHECK-NEXT:    retq
   %a = load <8 x i64>, <8 x i64>* %x
   %b = lshr <8 x i64> %a, <i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48>
@@ -770,11 +769,9 @@ define <8 x i32> @trunc_v8i64_v8i32_zeroes(<8 x i64>* %x) nounwind "min-legal-ve
 define <16 x i16> @trunc_v16i32_v16i16_zeroes(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" {
 ; CHECK-LABEL: trunc_v16i32_v16i16_zeroes:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpsrld $16, 32(%rdi), %ymm0
-; CHECK-NEXT:    vpsrld $16, (%rdi), %ymm1
-; CHECK-NEXT:    vpmovdw %ymm1, %xmm1
-; CHECK-NEXT:    vpmovdw %ymm0, %xmm0
-; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
+; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %a = load <16 x i32>, <16 x i32>* %x
   %b = lshr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -787,9 +784,8 @@ define <32 x i8> @trunc_v32i16_v32i8_zeroes(<32 x i16>* %x) nounwind "min-legal-
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpsrlw $8, 32(%rdi), %ymm0
 ; CHECK-NEXT:    vpsrlw $8, (%rdi), %ymm1
-; CHECK-NEXT:    vpmovwb %ymm1, %xmm1
-; CHECK-NEXT:    vpmovwb %ymm0, %xmm0
-; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    vpackuswb %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; CHECK-NEXT:    retq
   %a = load <32 x i16>, <32 x i16>* %x
   %b = lshr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
@@ -802,9 +798,8 @@ define <8 x i32> @trunc_v8i64_v8i32_sign(<8 x i64>* %x) nounwind "min-legal-vect
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpsraq $48, 32(%rdi), %ymm0
 ; CHECK-NEXT:    vpsraq $48, (%rdi), %ymm1
-; CHECK-NEXT:    vpmovqd %ymm1, %xmm1
-; CHECK-NEXT:    vpmovqd %ymm0, %xmm0
-; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    vpackssdw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; CHECK-NEXT:    retq
   %a = load <8 x i64>, <8 x i64>* %x
   %b = ashr <8 x i64> %a, <i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48>
@@ -817,9 +812,8 @@ define <16 x i16> @trunc_v16i32_v16i16_sign(<16 x i32>* %x) nounwind "min-legal-
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpsrad $16, 32(%rdi), %ymm0
 ; CHECK-NEXT:    vpsrad $16, (%rdi), %ymm1
-; CHECK-NEXT:    vpmovdw %ymm1, %xmm1
-; CHECK-NEXT:    vpmovdw %ymm0, %xmm0
-; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    vpackssdw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; CHECK-NEXT:    retq
   %a = load <16 x i32>, <16 x i32>* %x
   %b = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -832,9 +826,8 @@ define <32 x i8> @trunc_v32i16_v32i8_sign(<32 x i16>* %x) nounwind "min-legal-ve
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpsraw $8, 32(%rdi), %ymm0
 ; CHECK-NEXT:    vpsraw $8, (%rdi), %ymm1
-; CHECK-NEXT:    vpmovwb %ymm1, %xmm1
-; CHECK-NEXT:    vpmovwb %ymm0, %xmm0
-; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    vpacksswb %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; CHECK-NEXT:    retq
   %a = load <32 x i16>, <32 x i16>* %x
   %b = ashr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
-- 
2.7.4
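
Note (illustration only, not part of the commit): the new codegen packs each
pair of 256-bit halves per 128-bit lane and then fixes the cross-lane element
order with vpermq [0,2,1,3], as seen in the trunc_v8i64_v8i32_zeroes and
trunc_v32i16_v32i8_zeroes checks above. The sketch below reproduces that idiom
with AVX2 intrinsics for a 16 x i32 -> 16 x i16 truncate whose inputs have
their upper 16 bits already zeroed; the function name trunc16i32_to_16i16 and
the test values are made up for the example. It should build with something
like clang++ -O2 -mavx2.

    #include <immintrin.h>
    #include <cstdint>
    #include <cstdio>

    // Truncate sixteen i32 values (each known to fit in 16 bits) to i16
    // using two 256-bit registers instead of one 512-bit vpmovdw.
    static __m256i trunc16i32_to_16i16(__m256i lo, __m256i hi) {
      // vpackusdw packs within each 128-bit lane, so the 64-bit chunks of
      // the result are lo[0..3], hi[0..3], lo[4..7], hi[4..7].
      __m256i packed = _mm256_packus_epi32(lo, hi);
      // vpermq with 0xD8 ([0,2,1,3]) restores sequential order, matching
      // the "vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]" lines in the tests.
      return _mm256_permute4x64_epi64(packed, 0xD8);
    }

    int main() {
      alignas(32) uint32_t in[16];
      for (int i = 0; i < 16; ++i)
        in[i] = (uint32_t)i * 1000; // every value fits in 16 bits
      __m256i lo = _mm256_load_si256((const __m256i *)in);
      __m256i hi = _mm256_load_si256((const __m256i *)(in + 8));
      alignas(32) uint16_t out[16];
      _mm256_store_si256((__m256i *)out, trunc16i32_to_16i16(lo, hi));
      for (int i = 0; i < 16; ++i)
        printf("%u ", out[i]); // 0 1000 2000 ... 15000
      printf("\n");
      return 0;
    }

The unsigned-saturating pack is safe here precisely because the upper bits are
known zero; the sign-bit cases in the patch use the signed packs (vpackssdw,
vpacksswb) under the analogous sign-bit condition.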