From: Matt Arsenault Date: Sun, 29 Mar 2020 14:26:16 +0000 (-0400) Subject: AMDGPU: Form v_cvt_ubyte* with f16 results X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=db9f0d1ce5819b350293ff272277329e327215be;p=platform%2Fupstream%2Fllvm.git AMDGPU: Form v_cvt_ubyte* with f16 results We get 2 conversion instructions anyway. Previously we would get a conversion with SDWA reading from a byte source, which has a larger encoding. --- diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index c139133..caada91 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7978,7 +7978,7 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, DAGCombinerInfo &DCI) const { EVT VT = N->getValueType(0); EVT ScalarVT = VT.getScalarType(); - if (ScalarVT != MVT::f32) + if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16) return SDValue(); SelectionDAG &DAG = DCI.DAG; @@ -7993,8 +7993,14 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, // about in practice. if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) { if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { - SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src); + SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src); DCI.AddToWorklist(Cvt.getNode()); + + // For the f16 case, fold to a cast to f32 and then cast back to f16. + if (ScalarVT != MVT::f32) { + Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt, + DAG.getTargetConstant(0, DL, MVT::i32)); + } return Cvt; } } diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 15d88b5..c3f213b 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -202,7 +202,7 @@ define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind { ; VI-LABEL: v_uitofp_i32_to_f16_mask255: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; VI-NEXT: s_setpc_b64 s[30:31] %masked = and i32 %arg0, 255 @@ -222,7 +222,7 @@ define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind { ; VI-LABEL: v_sitofp_i32_to_f16_mask255: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_i32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; VI-NEXT: s_setpc_b64 s[30:31] %masked = and i32 %arg0, 255 @@ -242,7 +242,7 @@ define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind { ; VI-LABEL: v_uitofp_to_f16_lshr8_mask255: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 +; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; VI-NEXT: s_setpc_b64 s[30:31] %lshr.8 = lshr i32 %arg0, 8 @@ -263,7 +263,7 @@ define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind { ; VI-LABEL: v_uitofp_to_f16_lshr16_mask255: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; VI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; VI-NEXT: s_setpc_b64 s[30:31] %lshr.16 = lshr i32 %arg0, 16 @@ -284,7 +284,7 @@ define half @v_uitofp_to_f16_lshr24_mask255(i32 %arg0) nounwind { ; VI-LABEL: v_uitofp_to_f16_lshr24_mask255: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; VI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; VI-NEXT: s_setpc_b64 s[30:31] %lshr.16 = lshr i32 %arg0, 24