From 33b2cf50e32d96faa17cbd3ecbe16b6f9480dd5b Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Sat, 12 Jan 2019 08:05:12 +0000
Subject: [PATCH] [X86] Add ISD node for masked version of CVTPS2PH.

The 128-bit input produces 64 bits of output and fills the upper
64 bits of the destination with 0. The mask only applies to the lower
elements, but we can't represent this with a vselect like we normally
do.

This also avoids the need for a special X86ISD::SELECT when avx512bw
isn't enabled, since vselect on v8i16 isn't legal there.

Fixes another instruction for PR34877.

llvm-svn: 350994
---
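[Editor's note, not part of the original commit: a minimal LLVM IR
sketch of the new lowering, using the intrinsic signature from the
tests below. The function @cvtps2ph_sketch is hypothetical and exists
only for illustration.]

  declare <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float>, i32, <8 x i16>, i8)

  define <8 x i16> @cvtps2ph_sketch(<4 x float> %a, <8 x i16> %passthru, i8 %m) {
    ; An all-ones mask now lowers directly to the unmasked X86ISD::CVTPS2PH.
    %r0 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a, i32 2, <8 x i16> %passthru, i8 -1)
    ; A real mask lowers to the new X86ISD::MCVTPS2PH (SRC, RND, PASSTHRU, MASK):
    ; %m selects among the low 4 of the 8 i16 result lanes only, while the
    ; instruction itself zeroes the upper 64 bits, which is why a full-width
    ; vselect over the v8i16 result cannot express this.
    %r1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a, i32 2, <8 x i16> %passthru, i8 %m)
    %r = add <8 x i16> %r0, %r1
    ret <8 x i16> %r
  }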

 llvm/lib/Target/X86/X86ISelLowering.cpp      | 27 +++++++++++++++--------
 llvm/lib/Target/X86/X86ISelLowering.h        |  4 ++++
 llvm/lib/Target/X86/X86InstrAVX512.td        | 29 +++++++++++++++++++------
 llvm/lib/Target/X86/X86InstrFragmentsSIMD.td |  7 ++++++
 llvm/lib/Target/X86/X86IntrinsicsInfo.h      | 14 ++++++------
 llvm/test/CodeGen/X86/avx512-intrinsics.ll   |  2 +-
 llvm/test/CodeGen/X86/avx512vl-intrinsics.ll | 32 ++++++++++++++--------------
 7 files changed, 76 insertions(+), 39 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b9c5620..c896e36 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -21383,12 +21383,6 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
   case X86ISD::VPSHUFBITQMB:
   case X86ISD::VFPCLASS:
     return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
-  case X86ISD::CVTPS2PH:
-    // We can't use ISD::VSELECT here because it is not always "Legal"
-    // for the destination type. For example vpmovqb require only AVX512
-    // and vselect that can operate on byte element type require BWI
-    OpcodeSelect = X86ISD::SELECT;
-    break;
   }
   if (PreservedSrc.isUndef())
     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
@@ -22068,9 +22062,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       SDValue Mask = Op.getOperand(3);
 
       if (isAllOnesConstant(Mask))
-        return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl,
-                                                Op.getValueType(), Src),
-                                    Mask, PassThru, Subtarget, DAG);
+        return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
 
       MVT SrcVT = Src.getSimpleValueType();
       MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
@@ -22078,6 +22070,22 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
                          Mask);
     }
+    case CVTPS2PH_MASK: {
+      SDValue Src = Op.getOperand(1);
+      SDValue Rnd = Op.getOperand(2);
+      SDValue PassThru = Op.getOperand(3);
+      SDValue Mask = Op.getOperand(4);
+
+      if (isAllOnesConstant(Mask))
+        return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
+
+      MVT SrcVT = Src.getSimpleValueType();
+      MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
+      Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+      return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
+                         PassThru, Mask);
+
+    }
     default:
       break;
     }
@@ -27365,6 +27373,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
   case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
   case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
+  case X86ISD::MCVTPS2PH: return "X86ISD::MCVTPS2PH";
   case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
   case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND";
   case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 7e01590..5aba90f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -556,6 +556,10 @@ namespace llvm {
       // Conversions between float and half-float.
      CVTPS2PH, CVTPH2PS, CVTPH2PS_RND,
 
+      // Masked version of above.
+      // SRC, RND, PASSTHRU, MASK
+      MCVTPS2PH,
+
       // Galois Field Arithmetic Instructions
       GF2P8AFFINEINVQB, GF2P8AFFINEQB, GF2P8MULB,
 
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index fe0eb28..14faaac 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -8726,12 +8726,28 @@ let Predicates = [HasVLX] in {
 
 multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
                            X86MemOperand x86memop, SchedWrite RR, SchedWrite MR> {
-  defm rr : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst),
-                   (ins _src.RC:$src1, i32u8imm:$src2),
-                   "vcvtps2ph", "$src2, $src1", "$src1, $src2",
-                   (X86cvtps2ph (_src.VT _src.RC:$src1),
-                                (i32 imm:$src2)), 0, 0>,
-                   AVX512AIi8Base, Sched<[RR]>;
+let ExeDomain = GenericDomain in {
+  def rr : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst),
+             (ins _src.RC:$src1, i32u8imm:$src2),
+             "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+             [(set _dest.RC:$dst,
+                   (X86cvtps2ph (_src.VT _src.RC:$src1), (i32 imm:$src2)))]>,
+             Sched<[RR]>;
+  let Constraints = "$src0 = $dst" in
+  def rrk : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst),
+             (ins _dest.RC:$src0, _src.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
+             "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+             [(set _dest.RC:$dst,
+                   (X86mcvtps2ph (_src.VT _src.RC:$src1), (i32 imm:$src2),
+                                 _dest.RC:$src0, _src.KRCWM:$mask))]>,
+             Sched<[RR]>, EVEX_K;
+  def rrkz : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst),
+             (ins _src.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
+             "vcvtps2ph\t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}",
+             [(set _dest.RC:$dst,
+                   (X86mcvtps2ph (_src.VT _src.RC:$src1), (i32 imm:$src2),
+                                 _dest.ImmAllZerosV, _src.KRCWM:$mask))]>,
+             Sched<[RR]>, EVEX_KZ;
   let hasSideEffects = 0, mayStore = 1 in {
     def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
                         (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
@@ -8743,6 +8759,7 @@ multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
                EVEX_K, Sched<[MR]>, NotMemoryFoldable;
   }
 }
+}
 
 multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src,
                                SchedWrite Sched> {
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index d806472..e2746a0 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -595,6 +595,13 @@ def X86cvtps2ph : SDNode<"X86ISD::CVTPS2PH",
                         SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>,
                                              SDTCVecEltisVT<1, f32>,
                                              SDTCisVT<2, i32>]> >;
+def X86mcvtps2ph : SDNode<"X86ISD::MCVTPS2PH",
+                        SDTypeProfile<1, 4, [SDTCVecEltisVT<0, i16>,
+                                             SDTCVecEltisVT<1, f32>,
+                                             SDTCisVT<2, i32>,
+                                             SDTCisSameAs<0, 3>,
+                                             SDTCVecEltisVT<4, i1>,
+                                             SDTCisSameNumEltsAs<1, 4>]> >;
 def X86vfpextRnd : SDNode<"X86ISD::VFPEXT_RND",
                         SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>,
                                              SDTCVecEltisVT<1, f32>,
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 1c811a6..3e260ca 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -32,7 +32,7 @@ enum IntrinsicType : uint16_t {
   IFMA_OP, VPERM_2OP, INTR_TYPE_SCALAR_MASK,
   INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK,
   COMPRESS_EXPAND_IN_REG,
-  TRUNCATE_TO_REG,
+  TRUNCATE_TO_REG, CVTPS2PH_MASK,
   TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
   FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
   FIXUPIMMS_MASKZ, GATHER_AVX2,
@@ -838,12 +838,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
                      X86ISD::CVTPH2PS, 0),
   X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK,
                      X86ISD::CVTPH2PS, X86ISD::CVTPH2PS_RND),
-  X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, INTR_TYPE_2OP_MASK,
-                     X86ISD::CVTPS2PH, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, INTR_TYPE_2OP_MASK,
-                     X86ISD::CVTPS2PH, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, INTR_TYPE_2OP_MASK,
-                     X86ISD::CVTPS2PH, 0),
+  X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, CVTPS2PH_MASK,
+                     X86ISD::CVTPS2PH, X86ISD::MCVTPS2PH),
+  X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, CVTPS2PH_MASK,
+                     X86ISD::CVTPS2PH, X86ISD::MCVTPS2PH),
+  X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, CVTPS2PH_MASK,
+                     X86ISD::CVTPS2PH, X86ISD::MCVTPS2PH),
   X86_INTRINSIC_DATA(avx512_mask_vpshufbitqmb_128, CMP_MASK,
                      X86ISD::VPSHUFBITQMB, 0),
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
index 9fec17d..2e83817 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
@@ -990,8 +990,8 @@ define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16
 ; CHECK-LABEL: test_x86_vcvtps2ph_256:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vcvtps2ph $2, %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vcvtps2ph $2, %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
 ; CHECK-NEXT:    vcvtps2ph $2, %zmm0, (%rsi)
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
index 13d08d0..912715b 100644
--- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -4306,21 +4306,21 @@ define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0, i8 %mask, <8 x i16> %s
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vcvtps2ph $2, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc2,0x02]
+; X86-NEXT:    vcvtps2ph $2, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x02]
 ; X86-NEXT:    vcvtps2ph $2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x02]
-; X86-NEXT:    vcvtps2ph $2, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc2,0x02]
-; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
-; X86-NEXT:    vcvtps2ph $2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x02]
-; X86-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
+; X86-NEXT:    vpaddw %xmm1, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfd,0xc1]
+; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_x86_vcvtps2ph_128:
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vcvtps2ph $2, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc2,0x02]
+; X64-NEXT:    vcvtps2ph $2, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x02]
 ; X64-NEXT:    vcvtps2ph $2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x02]
-; X64-NEXT:    vcvtps2ph $2, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc2,0x02]
-; X64-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
-; X64-NEXT:    vcvtps2ph $2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x02]
-; X64-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
+; X64-NEXT:    vpaddw %xmm1, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfd,0xc1]
+; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
 %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
 %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 %mask)
@@ -4337,22 +4337,22 @@ define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0, i8 %mask, <8 x i16> %s
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vcvtps2ph $2, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc2,0x02]
+; X86-NEXT:    vcvtps2ph $2, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x02]
 ; X86-NEXT:    vcvtps2ph $2, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x02]
-; X86-NEXT:    vcvtps2ph $2, %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc2,0x02]
-; X86-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
-; X86-NEXT:    vcvtps2ph $2, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x02]
-; X86-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
+; X86-NEXT:    vpaddw %xmm1, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfd,0xc1]
+; X86-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_x86_vcvtps2ph_256:
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vcvtps2ph $2, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc2,0x02]
+; X64-NEXT:    vcvtps2ph $2, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x02]
 ; X64-NEXT:    vcvtps2ph $2, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x02]
-; X64-NEXT:    vcvtps2ph $2, %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc2,0x02]
-; X64-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9]
-; X64-NEXT:    vcvtps2ph $2, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x02]
-; X64-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc1]
+; X64-NEXT:    vpaddw %xmm1, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfd,0xc1]
+; X64-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
 ; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT:    retq # encoding: [0xc3]
 %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
-- 
2.7.4