From c48712b341f328c6095d50bde1947cc523b04014 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 12 Nov 2018 19:37:29 +0000 Subject: [PATCH] [X86] In LowerMULH, use generic truncate and vector shuffle nodes instead of directly emitting PACKUS. Truncate and shuffle lowering are already capable of matching to PACKUS using known bits analysis. This features one test change where we now prefer to extend v16i16->v16i32 then trunc v16i32->v16i8 over extract_subvector+packus when avx512f is available, but avx512bw is not. llvm-svn: 346697 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 31 ++++++++++++++++++------------- llvm/test/CodeGen/X86/combine-sdiv.ll | 4 ++-- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 6275e68..16b3a6c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -23435,12 +23435,15 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG); Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG); - SDValue Res = DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi); - // The ymm variant of PACKUS treats the 128-bit lanes separately, so we - // need to permute the final result into place. - Res = DAG.getBitcast(MVT::v4i64, Res); - Res = DAG.getVectorShuffle(MVT::v4i64, dl, Res, Res, { 0, 2, 1, 3 }); - return DAG.getBitcast(VT, Res); + // Bitcast back to VT and then pack all the even elements from Lo and Hi. + // Shuffle lowering should turn this into PACKUS+PERMQ + Lo = DAG.getBitcast(VT, Lo); + Hi = DAG.getBitcast(VT, Hi); + return DAG.getVectorShuffle(VT, dl, Lo, Hi, + { 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62}); } assert(VT == MVT::v16i8 && "Unexpected VT"); @@ -23450,12 +23453,7 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB); Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, MVT::v16i16, Mul, 8, DAG); - // If we have BWI we can use truncate instruction. - if (Subtarget.hasBWI()) - return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul); - Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Lo); - Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Hi); - return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul); } assert(VT == MVT::v16i8 && @@ -23506,7 +23504,14 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG); RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG); - return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); + + // Bitcast back to VT and then pack all the even elements from Lo and Hi. + // Shuffle lowering should turn this into PACKUS. + RLo = DAG.getBitcast(VT, RLo); + RHi = DAG.getBitcast(VT, RHi); + return DAG.getVectorShuffle(VT, dl, RLo, RHi, + { 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30}); } SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const { diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll index a78ecd2..4685165 100644 --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -3169,8 +3169,8 @@ define <16 x i8> @pr38658(<16 x i8> %x) { ; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm1 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX512F-NEXT: vpsrlw $7, %xmm0, %xmm1 ; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -- 2.7.4