From e96d03745d55c18b9afa7f581de06297bf1153b0 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Thu, 8 Dec 2016 20:14:46 +0000
Subject: [PATCH] AMDGPU: Make f16 ConstantFP legal

Not having this legal led to combine failures, resulting in dumb things
like bitcasts of constants not being folded away.

The only reason I'm leaving the v_mov_b32 hack that f32 already uses is
to avoid madak formation test regressions. PeepholeOptimizer has an
ordering issue where the immediate fold attempt is into the sgpr->vgpr
copy instead of the actual use. Running it twice avoids that problem.

llvm-svn: 289096
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 14 +-------------
 llvm/lib/Target/AMDGPU/SIISelLowering.h   |  3 ---
 llvm/lib/Target/AMDGPU/SIInstructions.td  | 13 +++++++++++++
 llvm/test/CodeGen/AMDGPU/br_cc.f16.ll     |  5 +++--
 4 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a0184bf..eeab482 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -277,7 +277,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
 
   // F16 - Constant Actions.
-  setOperationAction(ISD::ConstantFP, MVT::f16, Custom);
+  setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
 
   // F16 - Load/Store Actions.
   setOperationAction(ISD::LOAD, MVT::f16, Promote);
@@ -1848,9 +1848,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
   case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
   case ISD::TRAP: return lowerTRAP(Op, DAG);
-
-  case ISD::ConstantFP:
-    return lowerConstantFP(Op, DAG);
   case ISD::FP_ROUND:
     return lowerFP_ROUND(Op, DAG);
   }
@@ -2055,15 +2052,6 @@ SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
       DAG.getNode(ISD::FTRUNC, DL, VT, Op);
 }
 
-SDValue SITargetLowering::lowerConstantFP(SDValue Op, SelectionDAG &DAG) const {
-  if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(Op)) {
-    return DAG.getConstant(FP->getValueAPF().bitcastToAPInt().getZExtValue(),
-                           SDLoc(Op), MVT::i32);
-  }
-
-  return SDValue();
-}
-
 SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
   assert(Op.getValueType() == MVT::f16 &&
          "Do not know how to custom lower FP_ROUND for non-f16 type");
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 56d6ef2..cb6d536 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -53,9 +53,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
                             const SDLoc &DL,
                             EVT VT) const;
 
-  /// \brief Custom lowering for ISD::ConstantFP.
-  SDValue lowerConstantFP(SDValue Op, SelectionDAG &DAG) const;
-
   /// \brief Custom lowering for ISD::FP_ROUND for MVT::f16.
   SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 7ca5096..93e7bcd 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -706,12 +706,25 @@ def : Pat <
   (S_MOV_B32 imm:$imm)
 >;
 
+// FIXME: Workaround for ordering issue with peephole optimizer where
+// a register class copy interferes with immediate folding.  Should
+// use s_mov_b32, which can be shrunk to s_movk_i32
+def : Pat <
+  (VGPRImm<(f16 fpimm)>:$imm),
+  (V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))
+>;
+
 def : Pat <
   (f32 fpimm:$imm),
   (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
 >;
 
 def : Pat <
+  (f16 fpimm:$imm),
+  (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
+>;
+
+def : Pat <
   (i32 frameindex:$fi),
   (V_MOV_B32_e32 (i32 (frameindex_to_targetframeindex $fi)))
 >;
diff --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
index 9702604..b758471 100644
--- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
@@ -47,8 +47,9 @@ two:
 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
 
 ; SI: v_cmp_ngt_f32_e32 vcc, v[[B_F32]], v[[A_F32]]
-; VI: v_cmp_nle_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
 ; SI: s_cbranch_vccz
+
+; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
 ; VI: s_cbranch_vccnz
 
 ; VI: one{{$}}
@@ -85,7 +86,7 @@ two:
 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
 
 ; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
-; VI: v_cmp_nge_f16_e32 vcc, v[[B_F16]], v[[A_F16]]
+; VI: v_cmp_ngt_f16_e32 vcc, v[[B_F16]], v[[A_F16]]
 ; GCN: s_cbranch_vccnz
 
 ; GCN: one{{$}}
-- 
2.7.4
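
For context, a minimal sketch of the kind of fold the commit message refers to. The function name and constant below are hypothetical, not taken from the patch or its tests: with ISD::ConstantFP legal for MVT::f16, a bitcast of an f16 constant like this one can be combined into a plain integer immediate instead of surviving legalization as a bitcast of a constant.

; Illustrative only: a bitcast of an f16 constant that should now fold away.
define i16 @fold_bitcast_f16_imm() {
  %bc = bitcast half 0xH3C00 to i16   ; 0xH3C00 is 1.0 in half precision
  ret i16 %bc
}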