From e14474a39a14b3c86c6c5d5ed9bf11467a0bbe9b Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 5 Aug 2020 23:21:16 -0400
Subject: [PATCH] AMDGPU/GlobalISel: Select llvm.amdgcn.global.atomic.fadd

Remove the intermediate transform in the DAG path. I believe this is
the last non-deprecated intrinsic that needs handling.
---
 llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp |   1 +
 llvm/lib/Target/AMDGPU/BUFInstructions.td         |   2 +-
 llvm/lib/Target/AMDGPU/FLATInstructions.td        |   6 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp         |  19 +---
 llvm/lib/Target/AMDGPU/SIInstrInfo.td             |  13 +--
 .../GlobalISel/llvm.amdgcn.global.atomic.fadd.ll  | 101 +++++++++++++++++++++
 6 files changed, 113 insertions(+), 29 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index ae90c55..eb41e56 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4245,6 +4245,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
       break;
     }
+    case Intrinsic::amdgcn_global_atomic_fadd:
     case Intrinsic::amdgcn_global_atomic_csub:
       return getDefaultMappingAllVGPR(MI);
     case Intrinsic::amdgcn_ds_ordered_add:
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 2e30476..45eca4b 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1099,7 +1099,7 @@ defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN <
   "buffer_atomic_add_f32", VGPR_32, f32, atomic_fadd_global_noret
 >;
 defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
-  "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_pk_fadd_global_noret
+  "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_fadd_global_noret
 >;
 
 } // End SubtargetPredicate = HasAtomicFaddInsts
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index f99354b..3ee01d5 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -717,7 +717,7 @@ defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN <
   "global_atomic_add_f32", VGPR_32, f32, atomic_fadd_global_noret
 >;
 defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN <
-  "global_atomic_pk_add_f16", VGPR_32, v2f16, atomic_pk_fadd_global_noret
+  "global_atomic_pk_add_f16", VGPR_32, v2f16, atomic_fadd_global_noret
 >;
 
 } // End SubtargetPredicate = HasAtomicFaddInsts
@@ -784,7 +784,7 @@ class FlatAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
   (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data),
-  (inst $vaddr, $data, $offset, $slc)
+  (inst VReg_64:$vaddr, getVregSrcForVT<vt>.ret:$data, $offset, $slc)
 >;
 
 class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
                            ValueType data_vt = vt> : GCNPat <
@@ -969,4 +969,4 @@ let OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 in {
 def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_ADD_F32, atomic_fadd_global_noret, f32>;
-def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_PK_ADD_F16, atomic_pk_fadd_global_noret, v2f16>;
+def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_PK_ADD_F16, atomic_fadd_global_noret, v2f16>;
 
 } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ddb84b4..b744091 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1140,6 +1140,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                             ->getPointerElementType());
     Info.ptrVal = CI.getOperand(0);
     Info.align.reset();
+
+    // FIXME: Should report an atomic ordering here.
     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
 
     return true;
@@ -7521,21 +7523,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                     Op->getVTList(), Ops, VT,
                                     M->getMemOperand());
   }
-
-  case Intrinsic::amdgcn_global_atomic_fadd: {
-    SDValue Ops[] = {
-      Chain,
-      Op.getOperand(2), // ptr
-      Op.getOperand(3)  // vdata
-    };
-
-    EVT VT = Op.getOperand(3).getValueType();
-    auto *M = cast<MemSDNode>(Op);
-
-    return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT,
-                         DAG.getVTList(VT, MVT::Other), Ops,
-                         M->getMemOperand()).getValue(1);
-  }
   case Intrinsic::amdgcn_end_cf:
     return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
                                       Op->getOperand(2), Chain), 0);
@@ -8567,7 +8554,7 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
 
 /// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
 /// by the chain and intrinsic ID. Theoretically we would also need to check the
-/// specific intrinsic.
+/// specific intrinsic, but they all place the pointer operand first.
 static unsigned getBasePtrIndex(const MemSDNode *N) {
   switch (N->getOpcode()) {
   case ISD::STORE:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index cee2d94..618b0a1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -316,7 +316,7 @@ defm atomic_load_fmax_#as : binary_atomic_op<SIatomic_fmax, 0>;
 } // End let AddressSpaces = ...
 } // End foreach AddrSpace
 
-def atomic_fadd_global_noret : PatFrag<
+def atomic_fadd_global_noret_impl : PatFrag<
   (ops node:$ptr, node:$value),
   (atomic_load_fadd node:$ptr, node:$value)> {
   // FIXME: Move this
@@ -325,14 +325,9 @@ def atomic_fadd_global_noret : PatFrag<
   let MemoryVT = f32;
   let IsAtomic = 1;
   let AddressSpaces = StoreAddress_global.AddrSpaces;
 }
-def atomic_pk_fadd_global_noret : PatFrag<
-  (ops node:$ptr, node:$value),
-  (atomic_load_fadd node:$ptr, node:$value)> {
-  // FIXME: Move this
-  let MemoryVT = v2f16;
-  let IsAtomic = 1;
-  let AddressSpaces = StoreAddress_global.AddrSpaces;
-}
+def atomic_fadd_global_noret : PatFrags<(ops node:$src0, node:$src1),
+  [(int_amdgcn_global_atomic_fadd node:$src0, node:$src1),
+   (atomic_fadd_global_noret_impl node:$src0, node:$src1)]>;
 //===----------------------------------------------------------------------===//
 // SDNodes PatFrags for loads/stores with a glue input.
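
[Note: atomic_fadd_global_noret is now a PatFrags that matches both a direct
int_amdgcn_global_atomic_fadd intrinsic call and the generic atomic_load_fadd
node, so GlobalISel can import the MUBUF/FLAT patterns directly and the custom
expansion removed from LowerINTRINSIC_VOID above is no longer needed. A
minimal IR sketch of the f32 case (the function name is illustrative; the
intrinsic signature is the one used by the new test below):

  define void @fadd_noret_sketch(float addrspace(1)* %ptr, float %v) {
    ; no return value, so only the no-return (*_NO_RTN) pseudos are needed
    call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %ptr, float %v)
    ret void
  }
  declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* nocapture, float)

On gfx908 this is expected to select to global_atomic_add_f32, as the test
checks below show.]
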
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
new file mode 100644
index 0000000..60ba088
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s
+
+define void @global_atomic_fadd_f32(float addrspace(1)* %ptr, float %data) {
+; GFX908-LABEL: global_atomic_fadd_f32:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %ptr, float %data)
+  ret void
+}
+
+define void @global_atomic_fadd_f32_off_2048(float addrspace(1)* %ptr, float %data) {
+; GFX908-LABEL: global_atomic_fadd_f32_off_2048:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    s_movk_i32 s4, 0x800
+; GFX908-NEXT:    s_mov_b32 s5, 0
+; GFX908-NEXT:    v_mov_b32_e32 v3, s4
+; GFX908-NEXT:    v_mov_b32_e32 v4, s5
+; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr float, float addrspace(1)* %ptr, i64 512
+  call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %gep, float %data)
+  ret void
+}
+
+define void @global_atomic_fadd_f32_off_neg2047(float addrspace(1)* %ptr, float %data) {
+; GFX908-LABEL: global_atomic_fadd_f32_off_neg2047:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    s_mov_b32 s4, 0xfffff804
+; GFX908-NEXT:    s_mov_b32 s5, -1
+; GFX908-NEXT:    v_mov_b32_e32 v3, s4
+; GFX908-NEXT:    v_mov_b32_e32 v4, s5
+; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr float, float addrspace(1)* %ptr, i64 -511
+  call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %gep, float %data)
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fadd_f32_off_ss(float addrspace(1)* %ptr, float %data) {
+; GFX908-LABEL: global_atomic_fadd_f32_off_ss:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX908-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    s_add_u32 s0, s0, 0x800
+; GFX908-NEXT:    s_addc_u32 s1, s1, 0
+; GFX908-NEXT:    v_mov_b32_e32 v0, s0
+; GFX908-NEXT:    v_mov_b32_e32 v1, s1
+; GFX908-NEXT:    v_mov_b32_e32 v2, s2
+; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
+; GFX908-NEXT:    s_endpgm
+  %gep = getelementptr float, float addrspace(1)* %ptr, i64 512
+  call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %gep, float %data)
+  ret void
+}
+
+define void @global_atomic_fadd_v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
+; GFX908-LABEL: global_atomic_fadd_v2f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data)
+  ret void
+}
+
+define void @global_atomic_fadd_v2f16_off_neg2047(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
+; GFX908-LABEL: global_atomic_fadd_v2f16_off_neg2047:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    s_mov_b32 s4, 0xfffff804
+; GFX908-NEXT:    s_mov_b32 s5, -1
+; GFX908-NEXT:    v_mov_b32_e32 v3, s4
+; GFX908-NEXT:    v_mov_b32_e32 v4, s5
+; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX908-NEXT:    global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 -511
+  call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %gep, <2 x half> %data)
+  ret void
+}
+
+declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* nocapture, float) #0
+declare void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>) #0
+
+attributes #0 = { argmemonly nounwind willreturn }
-- 
2.7.4
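
[Note on the check constants above: the getelementptr indices are element
counts, so with 4-byte elements (float and <2 x half>) an index of 512 is a
byte offset of 512 * 4 = 2048 (the s_movk_i32 s4, 0x800), and an index of
-511 is a byte offset of -2044, i.e. 0xfffff804 in the low half with a -1
high half. The GlobalISel output shown here materializes that address
arithmetic with v_add_co_u32/v_addc_co_u32 rather than using the
instruction's offset field. The checks were autogenerated; after modifying
the test they can be refreshed with llvm/utils/update_llc_test_checks.py, as
noted in the file's header.]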