From 354a43c7bc562a4c67439a02e4bd717fb49a5857 Mon Sep 17 00:00:00 2001
From: Tom Stellard
Date: Fri, 1 Apr 2016 18:27:37 +0000
Subject: [PATCH] AMDGPU: Implement {BUFFER,FLAT}_ATOMIC_CMPSWAP{,_X2}
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Summary:
Implement BUFFER_ATOMIC_CMPSWAP{,_X2} instructions on all GCN targets, and
FLAT_ATOMIC_CMPSWAP{,_X2} on CI+.

32-bit instruction variants tested manually on Kabini and Bonaire. Tests
and parts of code provided by Jan Veselý.

Patch by: Vedran Miletić

Reviewers: arsenm, tstellarAMD, nhaehnle

Subscribers: jvesely, scchan, kanarayan, arsenm

Differential Revision: http://reviews.llvm.org/D17280

llvm-svn: 265170
---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  1 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   |  1 +
 llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td     |  5 ++
 llvm/lib/Target/AMDGPU/AMDGPUInstructions.td  |  7 +++
 llvm/lib/Target/AMDGPU/CIInstructions.td      |  8 ++-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 62 +++++++++++++++++++
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |  1 +
 llvm/lib/Target/AMDGPU/SIInstructions.td      | 35 ++++++++++-
 llvm/test/CodeGen/AMDGPU/global_atomics.ll    | 89 +++++++++++++++++++++++++++
 9 files changed, 206 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 855455e..f66593b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2812,6 +2812,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(INTERP_P2)
   NODE_NAME_CASE(STORE_MSKOR)
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
+  NODE_NAME_CASE(ATOMIC_CMP_SWAP)
   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
   }
   return nullptr;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 4627f34..70902dc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -313,6 +313,7 @@
   STORE_MSKOR,
   LOAD_CONSTANT,
   TBUFFER_STORE_FORMAT,
+  ATOMIC_CMP_SWAP,
 
   LAST_AMDGPU_ISD_NUMBER
 };
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 1993241..637d56a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -183,6 +183,11 @@ def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR",
                         SDTypeProfile<0, 2, []>,
                         [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
+def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP",
+                            SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisVec<2>]>,
+                            [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+                             SDNPMemOperand]>;
+
 def AMDGPUround : SDNode<"ISD::FROUND",
                          SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index d9e3ab4..56a45ce 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -400,6 +400,13 @@ def atomic_umax_global : global_binary_atomic_op<atomic_load_umax>;
 def atomic_umin_global : global_binary_atomic_op<atomic_load_umin>;
 def atomic_xor_global : global_binary_atomic_op<atomic_load_xor>;
 
+def atomic_cmp_swap_global : global_binary_atomic_op<AMDGPUatomic_cmp_swap>;
+def atomic_cmp_swap_global_nortn : PatFrag<
+  (ops node:$ptr, node:$value),
+  (atomic_cmp_swap_global node:$ptr, node:$value),
+  [{ return SDValue(N, 0).use_empty(); }]
+>;
+
 //===----------------------------------------------------------------------===//
 // Misc Pattern Fragments
 //===----------------------------------------------------------------------===//
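A note on the nortn fragment above: its C++ predicate matches only when
result 0 of the atomic node (the loaded value) is dead, which is what lets
the selector pick the non-returning machine instruction; the chain result
always has users and is not consulted. A standalone C++ sketch of the same
check (Node and its counters are hypothetical stand-ins, not LLVM's SDNode
API):

    #include <vector>

    struct Node {
      // useCount[i] = number of users of this node's i-th result value.
      std::vector<int> useCount;
    };

    // Mirrors the PatFrag predicate [{ return SDValue(N, 0).use_empty(); }]:
    // only the data result (value 0) decides between the returning and the
    // no-return instruction variant.
    bool selectNoRetVariant(const Node &n) {
      return n.useCount.empty() || n.useCount[0] == 0;
    }

    int main() {
      Node cmpSwap{{0, 1}}; // loaded value dead, chain used once
      return selectNoRetVariant(cmpSwap) ? 0 : 1; // returns 0: no-ret applies
    }
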
diff --git a/llvm/lib/Target/AMDGPU/CIInstructions.td b/llvm/lib/Target/AMDGPU/CIInstructions.td
index 593300f..c1b05fa 100644
--- a/llvm/lib/Target/AMDGPU/CIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/CIInstructions.td
@@ -308,8 +308,9 @@ def : FlatStorePat <FLAT_STORE_DWORD, store_flat, i32>;
 def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, v2i32>;
 def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, v4i32>;
 
-class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
-  (vt (node i64:$addr, vt:$data)),
+class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt,
+                     ValueType data_vt = vt> : Pat <
+  (vt (node i64:$addr, data_vt:$data)),
   (inst $addr, $data, 0, 0)
 >;
 
@@ -322,6 +323,9 @@ def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
 def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
 def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
 def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
 def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_RTN, atomic_cmp_swap_global, i32, v2i32>;
 def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_X2_RTN, atomic_cmp_swap_global, i64, v2i64>;
+
 } // End Predicates = [isCIVI]
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a3911f8..25ae818 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -257,6 +257,16 @@
   setOperationAction(ISD::FDIV, MVT::f32, Custom);
   setOperationAction(ISD::FDIV, MVT::f64, Custom);
 
+  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling
+  // and output demarshalling
+  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
+
+  // We can't return success/failure, only the old value;
+  // let LLVM add the comparison
+  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
+
   setTargetDAGCombine(ISD::FADD);
   setTargetDAGCombine(ISD::FSUB);
   setTargetDAGCombine(ISD::FMINNUM);
@@ -1156,6 +1166,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return LowerTrig(Op, DAG);
   case ISD::SELECT: return LowerSELECT(Op, DAG);
   case ISD::FDIV: return LowerFDIV(Op, DAG);
+  case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
   case ISD::STORE: return LowerSTORE(Op, DAG);
   case ISD::GlobalAddress: {
     MachineFunction &MF = DAG.getMachineFunction();
@@ -2003,6 +2014,34 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
   }
 }
 
+SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
+  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
+  assert(AtomicNode->isCompareAndSwap());
+  unsigned AS = AtomicNode->getAddressSpace();
+
+  // No custom lowering required for local address space
+  if (!isFlatGlobalAddrSpace(AS))
+    return Op;
+
+  // Non-local address space requires custom lowering for atomic compare
+  // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
+  SDLoc DL(Op);
+  SDValue ChainIn = Op.getOperand(0);
+  SDValue Addr = Op.getOperand(1);
+  SDValue Old = Op.getOperand(2);
+  SDValue New = Op.getOperand(3);
+  EVT VT = Op.getValueType();
+  MVT SimpleVT = VT.getSimpleVT();
+  MVT VecType = MVT::getVectorVT(SimpleVT, 2);
+
+  SDValue NewOld = DAG.getNode(ISD::BUILD_VECTOR, DL, VecType,
+                               New, Old);
+
+  SDValue Ops[] = { ChainIn, Addr, NewOld };
+  SDVTList VTList = DAG.getVTList(VT, MVT::Other);
+  return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
+                                 VTList, Ops, VT, AtomicNode->getMemOperand());
+}
+
 //===----------------------------------------------------------------------===//
 // Custom DAG optimizations
 //===----------------------------------------------------------------------===//
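The lowering above packs the operands so the selected instruction sees one
{new, old} register pair and hands the loaded value back in the low half of
that pair. A standalone C++ model of the data flow, assuming only what the
hunk above states (packCmpSwapData and bufferAtomicCmpswap are illustrative
stand-ins for the BUILD_VECTOR marshalling and the hardware op, not LLVM or
driver APIs):

    #include <array>
    #include <cstdint>
    #include <cstdio>

    // LowerATOMIC_CMP_SWAP builds its BUILD_VECTOR with element 0 = new
    // value and element 1 = compare value.
    static std::array<uint32_t, 2> packCmpSwapData(uint32_t newVal,
                                                   uint32_t oldVal) {
      return {newVal, oldVal};
    }

    // The instruction compares memory against element 1, conditionally
    // swaps in element 0, and returns the pre-op value in the low element
    // (sub0) -- the only piece the selection patterns keep.
    static uint32_t bufferAtomicCmpswap(uint32_t &mem,
                                        std::array<uint32_t, 2> vdata) {
      uint32_t loaded = mem;
      if (mem == vdata[1])
        mem = vdata[0];
      return loaded;
    }

    int main() {
      uint32_t cell = 7;
      uint32_t old = bufferAtomicCmpswap(cell, packCmpSwapData(42, 7));
      std::printf("old=%u mem=%u\n", (unsigned)old, (unsigned)cell);
      return 0; // prints old=7 mem=42
    }
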
@@ -2849,8 +2888,31 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
     if (!Node->hasAnyUseOfValue(0)) {
       MI->setDesc(TII->get(NoRetAtomicOp));
       MI->RemoveOperand(0);
+      return;
     }
 
+    // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
+    // instruction, because the return type of these instructions is a vec2 of
+    // the memory type, so it can be tied to the input operand.
+    // This means these instructions always have a use, so we need to add a
+    // special case to check if the atomic has only one extract_subreg use,
+    // which itself has no uses.
+    if ((Node->hasNUsesOfValue(1, 0) &&
+         Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
+         !Node->use_begin()->hasAnyUseOfValue(0))) {
+      unsigned Def = MI->getOperand(0).getReg();
+
+      // Change this into a noret atomic.
+      MI->setDesc(TII->get(NoRetAtomicOp));
+      MI->RemoveOperand(0);
+
+      // If we only remove the def operand from the atomic instruction, the
+      // extract_subreg will be left with a use of a vreg without a def.
+      // So we need to insert an implicit_def to avoid machine verifier
+      // errors.
+      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+              TII->get(AMDGPU::IMPLICIT_DEF), Def);
+    }
     return;
   }
 }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 842fbeb..34b9e07 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -41,6 +41,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const;
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
 
   void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index bc0afa0..8687b2a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1048,7 +1048,9 @@ defm BUFFER_ATOMIC_XOR : MUBUF_Atomic <
 //def BUFFER_ATOMIC_FMIN : MUBUF_ <mubuf<0x3f>, "buffer_atomic_fmin", []>; // isn't on VI
 //def BUFFER_ATOMIC_FMAX : MUBUF_ <mubuf<0x40>, "buffer_atomic_fmax", []>; // isn't on VI
 //def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <mubuf<0x50, 0x60>, "buffer_atomic_swap_x2", []>;
-//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", []>;
+defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Atomic <
+  mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", VReg_128, v2i64, null_frag
+>;
 //def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <mubuf<0x52, 0x62>, "buffer_atomic_add_x2", []>;
 //def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <mubuf<0x53, 0x63>, "buffer_atomic_sub_x2", []>;
 //def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <mubuf<0x54>, "buffer_atomic_rsub_x2", []>; // isn't on CI & VI
@@ -3186,6 +3188,37 @@ def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>;
 def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>;
 def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>;
 
+
+multiclass MUBUFCmpSwapPat <MUBUF inst_offset, MUBUF inst_addr64,
+                            SDPatternOperator node, ValueType node_vt,
+                            ValueType data_vt> {
+
+let Predicates = [isSI] in {
+  def : Pat <
+    (node_vt (node (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+                                      i16:$offset, i1:$slc), data_vt:$vdata_in)),
+    (EXTRACT_SUBREG
+      (inst_addr64 $vdata_in, $vaddr, $srsrc, $soffset, $offset, $slc), sub0)
+  >;
+
+}
+
+  def : Pat <
+    (node_vt (node (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset,
+                                      i1:$slc), data_vt:$vdata_in)),
+    (EXTRACT_SUBREG
+      (inst_offset $vdata_in, $srsrc, $soffset, $offset, $slc), sub0)
+  >;
+}
+
+defm : MUBUFCmpSwapPat <BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN,
+                        BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN,
+                        atomic_cmp_swap_global, i32, v2i32>;
+
+defm : MUBUFCmpSwapPat <BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN,
+                        BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN,
+                        atomic_cmp_swap_global, i64, v2i64>;
+
 //===----------------------------------------------------------------------===//
 // MTBUF Patterns
 //===----------------------------------------------------------------------===//
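Because these patterns always interpose an EXTRACT_SUBREG between the atomic
and its users, the usual value-is-dead test in AdjustInstrPostInstrSelection
never fires for cmpswap, so the hook looks one level up the use chain, as in
the SIISelLowering.cpp hunk above. A standalone C++ sketch of that decision
(SDNodeModel is a hypothetical stand-in for the real node types, not an LLVM
class):

    struct SDNodeModel {
      bool isExtractSubreg = false;
      int numUsers = 0;
      const SDNodeModel *onlyUser = nullptr; // valid when numUsers == 1
    };

    // True when the atomic's sole user is an EXTRACT_SUBREG whose own
    // result is dead: the cmpswap result is effectively unused, so the
    // no-return opcode (plus an IMPLICIT_DEF for the now-undefined vreg)
    // can be substituted.
    bool shouldUseNoRetCmpSwap(const SDNodeModel &atomic) {
      return atomic.numUsers == 1 && atomic.onlyUser &&
             atomic.onlyUser->isExtractSubreg &&
             atomic.onlyUser->numUsers == 0;
    }

    int main() {
      SDNodeModel deadExtract{true, 0, nullptr};
      SDNodeModel cmpSwap{false, 1, &deadExtract};
      return shouldUseNoRetCmpSwap(cmpSwap) ? 0 : 1; // returns 0
    }
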
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index a92ee89..83a8d02 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -758,6 +758,95 @@ entry:
   ret void
 }
 
+; CMP_SWAP
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_offset:
+; GCN: buffer_atomic_cmpswap v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+define void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
+  %0 = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_offset:
+; GCN: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; GCN: buffer_store_dword v[[RET]]
+define void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
+  %0 = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
+  %1 = extractvalue { i32, i1 } %0, 0
+  store i32 %1, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset:
+; SI: buffer_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
+define void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
+entry:
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+  %0 = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64_offset:
+; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
+; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: buffer_store_dword v[[RET]]
+define void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
+entry:
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+  %0 = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
+  %1 = extractvalue { i32, i1 } %0, 0
+  store i32 %1, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32:
+; GCN: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) {
+entry:
+  %0 = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret:
+; GCN: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dword v[[RET]]
+define void @atomic_cmpxchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
+entry:
+  %0 = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
+  %1 = extractvalue { i32, i1 } %0, 0
+  store i32 %1, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64:
+; SI: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
+entry:
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %0 = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64:
+; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: buffer_store_dword v[[RET]]
+define void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
+entry:
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %0 = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst
+  %1 = extractvalue { i32, i1 } %0, 0
+  store i32 %1, i32 addrspace(1)* %out2
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}atomic_xor_i32_offset:
 ; GCN: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) {
-- 
2.7.4