From cd0dfc93ebb4b909b58d0afa5630a7f8c25ff509 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 3 Apr 2016 18:22:03 +0000 Subject: [PATCH] [X86][SSE] Support for MOVMSK signbit extraction instructions Add support for lowering with the MOVMSK instruction to extract vector element signbits to a GPR. This is an early step towards more optimal handling of vector comparison results. Differential Revision: http://reviews.llvm.org/D18741 llvm-svn: 265266 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 31 ++++--------------------- llvm/lib/Target/X86/X86ISelLowering.h | 3 +++ llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 3 +++ llvm/lib/Target/X86/X86InstrSSE.td | 34 +++++++++++++--------------- llvm/lib/Target/X86/X86IntrinsicsInfo.h | 6 +++++ 5 files changed, 32 insertions(+), 45 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 0ceb61c..2d7c89d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -21888,6 +21888,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::AND: return "X86ISD::AND"; case X86ISD::BEXTR: return "X86ISD::BEXTR"; case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; + case X86ISD::MOVMSK: return "X86ISD::MOVMSK"; case X86ISD::PTEST: return "X86ISD::PTEST"; case X86ISD::TESTP: return "X86ISD::TESTP"; case X86ISD::TESTM: return "X86ISD::TESTM"; @@ -24018,33 +24019,9 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, case X86ISD::SETCC: KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); break; - case ISD::INTRINSIC_WO_CHAIN: { - unsigned IntId = cast(Op.getOperand(0))->getZExtValue(); - unsigned NumLoBits = 0; - switch (IntId) { - default: break; - case Intrinsic::x86_sse_movmsk_ps: - case Intrinsic::x86_avx_movmsk_ps_256: - case Intrinsic::x86_sse2_movmsk_pd: - case Intrinsic::x86_avx_movmsk_pd_256: - case Intrinsic::x86_mmx_pmovmskb: - case Intrinsic::x86_sse2_pmovmskb_128: - case Intrinsic::x86_avx2_pmovmskb: { - // High bits of movmskp{s|d}, pmovmskb are known zero. - switch (IntId) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break; - case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break; - case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break; - case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break; - case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break; - case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break; - case Intrinsic::x86_avx2_pmovmskb: NumLoBits = 32; break; - } - KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); - break; - } - } + case X86ISD::MOVMSK: { + unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements(); + KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits); break; } } diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 89d5241..664eadd 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -352,6 +352,9 @@ namespace llvm { // X86-specific multiply by immediate. MUL_IMM, + // Vector sign bit extraction. + MOVMSK, + // Vector bitwise comparisons. PTEST, diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index a5adfd3..2c75ad3 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -276,6 +276,9 @@ def X86ktest : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>; def X86testm : SDNode<"X86ISD::TESTM", SDTX86Testm, [SDNPCommutative]>; def X86testnm : SDNode<"X86ISD::TESTNM", SDTX86Testm, [SDNPCommutative]>; +def X86movmsk : SDNode<"X86ISD::MOVMSK", + SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVec<1>]>>; + def X86select : SDNode<"X86ISD::SELECT" , SDTSelect>; def X86pmuludq : SDNode<"X86ISD::PMULUDQ", diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 70bcc2b..43289d9 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -2765,25 +2765,23 @@ let Predicates = [HasAVX1Only] in { //===----------------------------------------------------------------------===// /// sse12_extr_sign_mask - sse 1 & 2 unpack and interleave -multiclass sse12_extr_sign_mask { +multiclass sse12_extr_sign_mask { def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src), !strconcat(asm, "\t{$src, $dst|$dst, $src}"), - [(set GR32orGR64:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>, + [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], IIC_SSE_MOVMSK, d>, Sched<[WriteVecLogic]>; } let Predicates = [HasAVX] in { - defm VMOVMSKPS : sse12_extr_sign_mask, PS, VEX; - defm VMOVMSKPD : sse12_extr_sign_mask, PD, VEX; - defm VMOVMSKPSY : sse12_extr_sign_mask, PS, - VEX, VEX_L; - defm VMOVMSKPDY : sse12_extr_sign_mask, PD, - VEX, VEX_L; + defm VMOVMSKPS : sse12_extr_sign_mask, PS, VEX; + defm VMOVMSKPD : sse12_extr_sign_mask, PD, VEX; + defm VMOVMSKPSY : sse12_extr_sign_mask, PS, VEX, VEX_L; + defm VMOVMSKPDY : sse12_extr_sign_mask, PD, VEX, VEX_L; def : Pat<(i32 (X86fgetsign FR32:$src)), (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>; @@ -2797,9 +2795,9 @@ let Predicates = [HasAVX] in { (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>; } -defm MOVMSKPS : sse12_extr_sign_mask, PS; -defm MOVMSKPD : sse12_extr_sign_mask, PD; def : Pat<(i32 (X86fgetsign FR32:$src)), @@ -4665,20 +4663,20 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in { def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src), "pmovmskb\t{$src, $dst|$dst, $src}", - [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))], + [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))], IIC_SSE_MOVMSK>, VEX; let Predicates = [HasAVX2] in { def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR256:$src), "pmovmskb\t{$src, $dst|$dst, $src}", - [(set GR32orGR64:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>, + [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>, VEX, VEX_L; } def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src), "pmovmskb\t{$src, $dst|$dst, $src}", - [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))], + [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))], IIC_SSE_MOVMSK>; } // ExeDomain = SSEPackedInt diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 1c8ec14..d1e2625 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -321,6 +321,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx_max_ps_256, INTR_TYPE_2OP, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(avx_min_pd_256, INTR_TYPE_2OP, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(avx_min_ps_256, INTR_TYPE_2OP, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx_movmsk_pd_256, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), + X86_INTRINSIC_DATA(avx_movmsk_ps_256, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), X86_INTRINSIC_DATA(avx_rcp_ps_256, INTR_TYPE_1OP, X86ISD::FRCP, 0), X86_INTRINSIC_DATA(avx_rsqrt_ps_256, INTR_TYPE_1OP, X86ISD::FRSQRT, 0), X86_INTRINSIC_DATA(avx_sqrt_pd_256, INTR_TYPE_1OP, ISD::FSQRT, 0), @@ -354,6 +356,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_pminu_b, INTR_TYPE_2OP, ISD::UMIN, 0), X86_INTRINSIC_DATA(avx2_pminu_d, INTR_TYPE_2OP, ISD::UMIN, 0), X86_INTRINSIC_DATA(avx2_pminu_w, INTR_TYPE_2OP, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx2_pmovmskb, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), X86_INTRINSIC_DATA(avx2_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), X86_INTRINSIC_DATA(avx2_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), X86_INTRINSIC_DATA(avx2_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0), @@ -2184,6 +2187,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse_comineq_ss, COMI, X86ISD::COMI, ISD::SETNE), X86_INTRINSIC_DATA(sse_max_ps, INTR_TYPE_2OP, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(sse_min_ps, INTR_TYPE_2OP, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(sse_movmsk_ps, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0), X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0), X86_INTRINSIC_DATA(sse_sqrt_ps, INTR_TYPE_1OP, ISD::FSQRT, 0), @@ -2201,6 +2205,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_comineq_sd, COMI, X86ISD::COMI, ISD::SETNE), X86_INTRINSIC_DATA(sse2_max_pd, INTR_TYPE_2OP, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(sse2_min_pd, INTR_TYPE_2OP, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(sse2_movmsk_pd, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0), @@ -2210,6 +2215,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_pmaxu_b, INTR_TYPE_2OP, ISD::UMAX, 0), X86_INTRINSIC_DATA(sse2_pmins_w, INTR_TYPE_2OP, ISD::SMIN, 0), X86_INTRINSIC_DATA(sse2_pminu_b, INTR_TYPE_2OP, ISD::UMIN, 0), + X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), -- 2.7.4