SDValue PassThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
+ unsigned RC = 0;
+ unsigned Opc = IntrData->Opc0;
+ bool SAE = Src.getValueType().is512BitVector() &&
+ (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
+ if (SAE) {
+ Opc = X86ISD::CVTPS2PH_SAE;
+ Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
+ }
+
if (isAllOnesConstant(Mask))
- return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd);
+ return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
+ if (SAE)
+ Opc = X86ISD::MCVTPS2PH_SAE;
+ else
+ Opc = IntrData->Opc1;
MVT SrcVT = Src.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
- return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd,
- PassThru, Mask);
-
+ return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
}
case CVTNEPS2BF16_MASK: {
SDValue Src = Op.getOperand(1);
NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
NODE_NAME_CASE(CVTPS2PH)
NODE_NAME_CASE(STRICT_CVTPS2PH)
+ NODE_NAME_CASE(CVTPS2PH_SAE)
NODE_NAME_CASE(MCVTPS2PH)
+ NODE_NAME_CASE(MCVTPS2PH_SAE)
NODE_NAME_CASE(CVTPH2PS)
NODE_NAME_CASE(STRICT_CVTPH2PS)
NODE_NAME_CASE(CVTPH2PS_SAE)
multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src,
SchedWrite Sched> {
- let hasSideEffects = 0, Uses = [MXCSR] in
- defm rrb : AVX512_maskable_in_asm<0x1D, MRMDestReg, _dest,
- (outs _dest.RC:$dst),
- (ins _src.RC:$src1, i32u8imm:$src2),
- "vcvtps2ph", "$src2, {sae}, $src1", "$src1, {sae}, $src2", []>,
- EVEX_B, AVX512AIi8Base, Sched<[Sched]>;
+ // Replace the asm-only (pattern-less) AVX512_maskable_in_asm defs with real
+ // selection patterns so ISel can match the X86cvtps2phSAE / X86mcvtps2phSAE
+ // nodes (see the NODE_NAME_CASE additions for CVTPS2PH_SAE / MCVTPS2PH_SAE
+ // elsewhere in this patch). EVEX_B encodes the {sae} (suppress-all-exceptions)
+ // broadcast/round bit on all three forms.
+ let hasSideEffects = 0, Uses = [MXCSR] in {
+ // Unmasked {sae} form: dst = cvtps2ph(src1, imm).
+ def rrb : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst),
+ (ins _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, {sae}, $src1, $dst|$dst, $src1, {sae}, $src2}",
+ [(set _dest.RC:$dst,
+ (X86cvtps2phSAE (_src.VT _src.RC:$src1), (i32 timm:$src2)))]>,
+ EVEX_B, Sched<[Sched]>;
+ // Merge-masked {sae} form (EVEX_K): unselected lanes keep $src0, which is
+ // tied to $dst via the constraint below.
+ let Constraints = "$src0 = $dst" in
+ def rrbk : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst),
+ (ins _dest.RC:$src0, _src.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, {sae}, $src1, $dst {${mask}}|$dst {${mask}}, $src1, {sae}, $src2}",
+ [(set _dest.RC:$dst,
+ (X86mcvtps2phSAE (_src.VT _src.RC:$src1), (i32 timm:$src2),
+ _dest.RC:$src0, _src.KRCWM:$mask))]>,
+ EVEX_B, Sched<[Sched]>, EVEX_K;
+ // Zero-masked {sae} form (EVEX_KZ): unselected lanes are zeroed, expressed
+ // by passing ImmAllZerosV as the pass-through operand.
+ def rrbkz : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst),
+ (ins _src.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, {sae}, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, {sae}, $src2}",
+ [(set _dest.RC:$dst,
+ (X86mcvtps2phSAE (_src.VT _src.RC:$src1), (i32 timm:$src2),
+ _dest.ImmAllZerosV, _src.KRCWM:$mask))]>,
+ EVEX_B, Sched<[Sched]>, EVEX_KZ;
+}
}
let Predicates = [HasAVX512] in {
; X64-LABEL: test_x86_vcvtps2ph_256:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
-; X64-NEXT: vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z}
-; X64-NEXT: vcvtps2ph $2, %zmm0, %ymm1 {%k1}
+; X64-NEXT: vcvtps2ph $3, {sae}, %zmm0, %ymm2 {%k1} {z}
+; X64-NEXT: vcvtps2ph $4, {sae}, %zmm0, %ymm1 {%k1}
; X64-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; X64-NEXT: vcvtps2ph $2, %zmm0, (%rsi)
; X64-NEXT: vmovdqa %ymm1, %ymm0
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
-; X86-NEXT: vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z}
-; X86-NEXT: vcvtps2ph $2, %zmm0, %ymm1 {%k1}
+; X86-NEXT: vcvtps2ph $3, {sae}, %zmm0, %ymm2 {%k1} {z}
+; X86-NEXT: vcvtps2ph $4, {sae}, %zmm0, %ymm1 {%k1}
; X86-NEXT: vpaddw %ymm1, %ymm2, %ymm1
; X86-NEXT: vcvtps2ph $2, %zmm0, (%eax)
; X86-NEXT: vmovdqa %ymm1, %ymm0
; X86-NEXT: retl
%res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
- %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 %mask)
- %res3 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> %src, i16 %mask)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 11, <16 x i16> zeroinitializer, i16 %mask)
+ %res3 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 12, <16 x i16> %src, i16 %mask)
store <16 x i16> %res1, ptr %dst
%res = add <16 x i16> %res2, %res3
ret <16 x i16> %res
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vcvtps2ph $2, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc2,0x02]
-; X86-NEXT: vcvtps2ph $2, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x02]
-; X86-NEXT: vcvtps2ph $2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x02]
+; X86-NEXT: vcvtps2ph $10, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x0a]
+; X86-NEXT: vcvtps2ph $11, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x0b]
; X86-NEXT: vpaddw %xmm1, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfd,0xc1]
; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vcvtps2ph $2, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc2,0x02]
-; X64-NEXT: vcvtps2ph $2, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x02]
-; X64-NEXT: vcvtps2ph $2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x02]
+; X64-NEXT: vcvtps2ph $10, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x0a]
+; X64-NEXT: vcvtps2ph $11, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x0b]
; X64-NEXT: vpaddw %xmm1, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfd,0xc1]
; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
%res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
- %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 %mask)
- %res3 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> %src, i8 %mask)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 10, <8 x i16> zeroinitializer, i8 %mask)
+ %res3 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 11, <8 x i16> %src, i8 %mask)
%res0 = add <8 x i16> %res1, %res2
%res = add <8 x i16> %res3, %res0
ret <8 x i16> %res
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vcvtps2ph $2, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc2,0x02]
-; X86-NEXT: vcvtps2ph $2, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x02]
-; X86-NEXT: vcvtps2ph $2, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x02]
+; X86-NEXT: vcvtps2ph $11, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x0b]
+; X86-NEXT: vcvtps2ph $12, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x0c]
; X86-NEXT: vpaddw %xmm1, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfd,0xc1]
; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vcvtps2ph $2, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc2,0x02]
-; X64-NEXT: vcvtps2ph $2, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x02]
-; X64-NEXT: vcvtps2ph $2, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x02]
+; X64-NEXT: vcvtps2ph $11, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x0b]
+; X64-NEXT: vcvtps2ph $12, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x0c]
; X64-NEXT: vpaddw %xmm1, %xmm3, %xmm0 # encoding: [0xc5,0xe1,0xfd,0xc1]
; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
- %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 %mask)
- %res3 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> %src, i8 %mask)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 11, <8 x i16> zeroinitializer, i8 %mask)
+ %res3 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 12, <8 x i16> %src, i8 %mask)
%res0 = add <8 x i16> %res1, %res2
%res = add <8 x i16> %res3, %res0
ret <8 x i16> %res