From 381a94a7643afdc9025e24219c9700e29adad68f Mon Sep 17 00:00:00 2001
From: Tom Stellard
Date: Tue, 12 May 2015 15:00:49 +0000
Subject: [PATCH] R600/SI: Remove explicit m0 operand from DS instructions

Instead add m0 as an implicit operand.  This helps avoid spills
of the m0 register in some cases.

llvm-svn: 237141
---
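Notes for reviewers:

The heart of the change is the glueCopyToM0() hook added to
AMDGPUISelDAGToDAG.cpp below.  The sketch that follows is a condensed
illustration of the mechanism, not the verbatim code from the patch: the
helper name glueM0Init is made up here, and the real patch routes the copy
through SITargetLowering::copyToM0 rather than calling getCopyToReg
directly.

    // Illustration only.  A CopyToReg that writes -1 (the "max value" per
    // the comment in glueCopyToM0) to m0 yields two results: a chain and a
    // glue value.  Appending that glue value to a memory node's operand
    // list pins the copy directly in front of the DS instruction the node
    // is selected into.
    SDNode *glueM0Init(SelectionDAG &DAG, SDNode *MemNode) {
      SDLoc DL(MemNode);
      SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, AMDGPU::M0,
                                      DAG.getTargetConstant(-1, DL, MVT::i32),
                                      SDValue());
      SDValue Glue = Copy.getValue(1); // result 1 of CopyToReg is the glue

      // Rebuild the node with the glue as its last operand.  The new
      // SIld_local/SIst_local SDNodes declare SDNPInGlue, so the TableGen
      // patterns expect exactly this shape.
      SmallVector<SDValue, 8> Ops(MemNode->op_begin(), MemNode->op_end());
      Ops.push_back(Glue);
      return DAG.MorphNodeTo(MemNode, MemNode->getOpcode(),
                             MemNode->getVTList(), Ops);
    }

Because the DS format class now also declares "let Uses = [M0]", the
machine verifier and the register allocator still see the implicit read of
m0, but there is no longer an explicit, spillable operand carrying it.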
 llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp   |  79 +++++++++++-----
 llvm/lib/Target/R600/AMDGPUInstructions.td    |  37 ++++----
 llvm/lib/Target/R600/SIInstrFormats.td        |   2 +-
 llvm/lib/Target/R600/SIInstrInfo.td           | 125 +++++++++++++++++++++++---
 llvm/lib/Target/R600/SIInstructions.td        |  96 ++++++++++----------
 llvm/lib/Target/R600/SILoadStoreOptimizer.cpp |  38 ++++----
 llvm/test/CodeGen/R600/ds_read2st64.ll        |   2 +-
 llvm/test/CodeGen/R600/shl_add_ptr.ll         |   2 +-
 8 files changed, 261 insertions(+), 120 deletions(-)

diff --git a/llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index 8898cf2..df4461e 100644
--- a/llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -78,6 +78,8 @@ private:
   bool isLocalLoad(const LoadSDNode *N) const;
   bool isRegionLoad(const LoadSDNode *N) const;
 
+  SDNode *glueCopyToM0(SDNode *N) const;
+
   const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
   bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
   bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
@@ -242,6 +244,32 @@ bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) {
   return true;
 }
 
+SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
+  if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
+      !checkType(cast<MemSDNode>(N)->getMemOperand()->getValue(),
+                 AMDGPUAS::LOCAL_ADDRESS))
+    return N;
+
+  const SITargetLowering& Lowering =
+      *static_cast<const SITargetLowering*>(getTargetLowering());
+
+  // Write max value to m0 before each load operation
+
+  SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N),
+                                 CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
+
+  SDValue Glue = M0.getValue(1);
+
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    Ops.push_back(N->getOperand(i));
+  }
+  Ops.push_back(Glue);
+  CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
+
+  return N;
+}
+
 SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
   unsigned int Opc = N->getOpcode();
   if (N->isMachineOpcode()) {
@@ -249,6 +277,9 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
     return nullptr;   // Already selected.
   }
 
+  if (isa<AtomicSDNode>(N))
+    N = glueCopyToM0(N);
+
   switch (Opc) {
   default: break;
   // We are selecting i64 ADD here instead of custom lower it during
@@ -423,23 +454,29 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
   }
 
   case ISD::LOAD: {
+    LoadSDNode *LD = cast<LoadSDNode>(N);
+    SDLoc SL(N);
+    EVT VT = N->getValueType(0);
+
+    if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) {
+      N = glueCopyToM0(N);
+      break;
+    }
+
     // To simplify the TableGen patters, we replace all i64 loads with
     // v2i32 loads.  Alternatively, we could promote i64 loads to v2i32
     // during DAG legalization, however, so places (ExpandUnalignedLoad)
     // in the DAG legalizer assume that if i64 is legal, so doing this
     // promotion early can cause problems.
-    EVT VT = N->getValueType(0);
-    LoadSDNode *LD = cast<LoadSDNode>(N);
-    if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD)
-      break;
 
     SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SDLoc(N), LD->getChain(),
-                                     LD->getBasePtr(), LD->getMemOperand());
-    SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
+                                      LD->getBasePtr(), LD->getMemOperand());
+    SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL,
                                       MVT::i64, NewLoad);
     CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLoad.getValue(1));
     CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), BitCast);
-    SelectCode(NewLoad.getNode());
+    SDNode *Load = glueCopyToM0(NewLoad.getNode());
+    SelectCode(Load);
     N = BitCast.getNode();
     break;
   }
@@ -448,24 +485,26 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
     // Handle i64 stores here for the same reason mentioned above for loads.
     StoreSDNode *ST = cast<StoreSDNode>(N);
     SDValue Value = ST->getValue();
-    if (Value.getValueType() != MVT::i64 || ST->isTruncatingStore())
-      break;
+    if (Value.getValueType() == MVT::i64 && !ST->isTruncatingStore()) {
+
+      SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
+                                         MVT::v2i32, Value);
+      SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue,
+                                          ST->getBasePtr(), ST->getMemOperand());
 
-    SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N),
-                                       MVT::v2i32, Value);
-    SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue,
-                                        ST->getBasePtr(), ST->getMemOperand());
+      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore);
 
-    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore);
+      if (NewValue.getOpcode() == ISD::BITCAST) {
+        Select(NewStore.getNode());
+        return SelectCode(NewValue.getNode());
+      }
 
-    if (NewValue.getOpcode() == ISD::BITCAST) {
-      Select(NewStore.getNode());
-      return SelectCode(NewValue.getNode());
+      // getNode() may fold the bitcast if its input was another bitcast.  If that
+      // happens we should only select the new store.
+      N = NewStore.getNode();
     }
 
-    // getNode() may fold the bitcast if its input was another bitcast.  If that
-    // happens we should only select the new store.
-    N = NewStore.getNode();
+    N = glueCopyToM0(N);
 
     break;
   }
diff --git a/llvm/lib/Target/R600/AMDGPUInstructions.td b/llvm/lib/Target/R600/AMDGPUInstructions.td
index caec481..72cab39 100644
--- a/llvm/lib/Target/R600/AMDGPUInstructions.td
+++ b/llvm/lib/Target/R600/AMDGPUInstructions.td
@@ -183,12 +183,15 @@ def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
     return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
 }]>;
 
-def az_extload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
+class AZExtLoadBase <SDPatternOperator ld_node>: PatFrag<(ops node:$ptr),
+                                                 (ld_node node:$ptr), [{
   LoadSDNode *L = cast<LoadSDNode>(N);
   return L->getExtensionType() == ISD::ZEXTLOAD ||
          L->getExtensionType() == ISD::EXTLOAD;
 }]>;
 
+def az_extload : AZExtLoadBase <unindexedload>;
+
 def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
   return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
 }]>;
@@ -361,22 +364,26 @@ def mskor_global : PatFrag<(ops node:$val, node:$ptr),
   return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
 }]>;
 
+multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> {
 
-def atomic_cmp_swap_32_local :
-  PatFrag<(ops node:$ptr, node:$cmp, node:$swap),
-          (atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{
-  AtomicSDNode *AN = cast<AtomicSDNode>(N);
-  return AN->getMemoryVT() == MVT::i32 &&
-         AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
-}]>;
+  def _32_local : PatFrag <
+    (ops node:$ptr, node:$cmp, node:$swap),
+    (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
+      AtomicSDNode *AN = cast<AtomicSDNode>(N);
+      return AN->getMemoryVT() == MVT::i32 &&
+             AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+  }]>;
 
-def atomic_cmp_swap_64_local :
-  PatFrag<(ops node:$ptr, node:$cmp, node:$swap),
-          (atomic_cmp_swap node:$ptr, node:$cmp, node:$swap), [{
-  AtomicSDNode *AN = cast<AtomicSDNode>(N);
-  return AN->getMemoryVT() == MVT::i64 &&
-         AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
-}]>;
+  def _64_local : PatFrag<
+    (ops node:$ptr, node:$cmp, node:$swap),
+    (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
+      AtomicSDNode *AN = cast<AtomicSDNode>(N);
+      return AN->getMemoryVT() == MVT::i64 &&
+             AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+  }]>;
+}
+
+defm atomic_cmp_swap : AtomicCmpSwapLocal <atomic_cmp_swap>;
 
 def flat_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
     return isFlatLoad(dyn_cast<LoadSDNode>(N));
diff --git a/llvm/lib/Target/R600/SIInstrFormats.td b/llvm/lib/Target/R600/SIInstrFormats.td
index bc693c3..5a505cf 100644
--- a/llvm/lib/Target/R600/SIInstrFormats.td
+++ b/llvm/lib/Target/R600/SIInstrFormats.td
@@ -604,7 +604,7 @@ class DS <dag outs, dag ins, string asm, list<dag> pattern> :
   let LGKM_CNT = 1;
   let DS = 1;
   let UseNamedOperandTable = 1;
-  let DisableEncoding = "$m0";
+  let Uses = [M0];
 
   // Most instruction load and store data, so set this as the default.
   let mayLoad = 1;
diff --git a/llvm/lib/Target/R600/SIInstrInfo.td b/llvm/lib/Target/R600/SIInstrInfo.td
index 4e482b7..c154bb6 100644
--- a/llvm/lib/Target/R600/SIInstrInfo.td
+++ b/llvm/lib/Target/R600/SIInstrInfo.td
@@ -124,6 +124,107 @@
 def SIconstdata_ptr : SDNode<
   "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 0, [SDTCisVT<0, i64>]>
 >;
 
+//===----------------------------------------------------------------------===//
+// SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1
+// to be glued to the memory instructions.
+//===----------------------------------------------------------------------===//
+
+def SIld_local : SDNode <"ISD::LOAD", SDTLoad,
+  [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
+>;
+
+def si_ld_local : PatFrag <(ops node:$ptr), (SIld_local node:$ptr), [{
+  return isLocalLoad(cast<LoadSDNode>(N));
+}]>;
+
+def si_load_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAddressingMode() == ISD::UNINDEXED &&
+         cast<LoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
+}]>;
+
+def si_load_local_align8 : Aligned8Bytes <
+  (ops node:$ptr), (si_load_local node:$ptr)
+>;
+
+def si_sextload_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{
+  return cast<LoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD;
+}]>;
+def si_az_extload_local : AZExtLoadBase <si_ld_local>;
+
+multiclass SIExtLoadLocal <PatFrag ld_node> {
+
+  def _i8 : PatFrag <(ops node:$ptr), (ld_node node:$ptr),
+                     [{return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;}]
+  >;
+
+  def _i16 : PatFrag <(ops node:$ptr), (ld_node node:$ptr),
+                      [{return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;}]
+  >;
+}
+
+defm si_sextload_local : SIExtLoadLocal <si_sextload_local>;
+defm si_az_extload_local : SIExtLoadLocal <si_az_extload_local>;
+
+def SIst_local : SDNode <"ISD::STORE", SDTStore,
+  [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue]
+>;
+
+def si_st_local : PatFrag <
+  (ops node:$val, node:$ptr), (SIst_local node:$val, node:$ptr), [{
+  return isLocalStore(cast<StoreSDNode>(N));
+}]>;
+
+def si_store_local : PatFrag <
+  (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAddressingMode() == ISD::UNINDEXED &&
+         !cast<StoreSDNode>(N)->isTruncatingStore();
+}]>;
+
+def si_store_local_align8 : Aligned8Bytes <
+  (ops node:$val, node:$ptr), (si_store_local node:$val, node:$ptr)
+>;
+
+def si_truncstore_local : PatFrag <
+  (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->isTruncatingStore();
+}]>;
+
+def si_truncstore_local_i8 : PatFrag <
+  (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def si_truncstore_local_i16 : PatFrag <
+  (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+multiclass SIAtomicM0Glue2 <string op_name> {
+
+  def _glue : SDNode <"ISD::ATOMIC_"#op_name, SDTAtomic2,
+    [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
+  >;
+
+  def _local : local_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>;
+}
+
+defm si_atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
+defm si_atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">;
+defm si_atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">;
+defm si_atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">;
+defm si_atomic_load_or : SIAtomicM0Glue2 <"LOAD_OR">;
+defm si_atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
+defm si_atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">;
+defm si_atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
+defm si_atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
+defm si_atomic_swap : SIAtomicM0Glue2 <"SWAP">;
+
+def si_atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3,
+  [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
+>;
+
+defm si_atomic_cmp_swap : AtomicCmpSwapLocal <si_atomic_cmp_swap_glue>;
+
 // Transformation function, extract the lower 32bit of a 64bit immediate
 def LO32 : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getZExtValue() & 0xffffffff, SDLoc(N),
@@ -1726,7 +1827,7 @@ class DS_Off16_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm
 
 multiclass DS_1A_RET <bits<8> op, string opName, RegisterClass rc,
   dag outs = (outs rc:$vdst),
-  dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds, M0Reg:$m0),
+  dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds),
  string asm = opName#" $vdst, $addr"#"$offset$gds"> {
 
   def "" : DS_Pseudo <opName, outs, ins, []>;
@@ -1740,7 +1841,7 @@ multiclass DS_1A_RET <bits<8> op, string opName, RegisterClass rc,
 multiclass DS_1A_Off8_RET <bits<8> op, string opName, RegisterClass rc,
   dag outs = (outs rc:$vdst),
   dag ins = (ins VGPR_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1,
-             gds01:$gds, M0Reg:$m0),
+             gds01:$gds),
   string asm = opName#" $vdst, $addr"#"$offset0"#"$offset1$gds"> {
 
   def "" : DS_Pseudo <opName, outs, ins, []>;
@@ -1753,8 +1854,7 @@ multiclass DS_1A_Off8_RET <bits<8> op, string opName, RegisterClass rc,
 
 multiclass DS_1A1D_NORET <bits<8> op, string opName, RegisterClass rc,
   dag outs = (outs),
-  dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds,
-             M0Reg:$m0),
+  dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds),
   string asm = opName#" $addr, $data0"#"$offset$gds"> {
 
   def "" : DS_Pseudo <opName, outs, ins, []>,
@@ -1769,7 +1869,7 @@ multiclass DS_1A1D_NORET <bits<8> op, string opName, RegisterClass rc,
 multiclass DS_1A1D_Off8_NORET <bits<8> op, string opName, RegisterClass rc,
   dag outs = (outs),
   dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1,
-              ds_offset0:$offset0, ds_offset1:$offset1, gds01:$gds, M0Reg:$m0),
+              ds_offset0:$offset0, ds_offset1:$offset1, gds01:$gds),
   string asm = opName#" $addr, $data0, $data1"#"$offset0"#"$offset1"#"$gds"> {
 
   def "" : DS_Pseudo <opName, outs, ins, []>;
@@ -1783,8 +1883,7 @@ multiclass DS_1A1D_Off8_NORET <bits<8> op, string opName, RegisterClass rc,
 multiclass DS_1A1D_RET <bits<8> op, string opName, RegisterClass rc,
                         string noRetOp = "",
   dag outs = (outs rc:$vdst),
-  dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds,
-             M0Reg:$m0),
+  dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds),
   string asm = opName#" $vdst, $addr, $data0"#"$offset$gds"> {
 
   def "" : DS_Pseudo <opName, outs, ins, []>,
@@ -1812,14 +1911,14 @@ multiclass DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc,
                         string noRetOp = "", RegisterClass src = rc> :
   DS_1A2D_RET_m <op, asm, rc, noRetOp,
                  (ins VGPR_32:$addr, src:$data0, src:$data1,
-                      ds_offset:$offset, gds:$gds, M0Reg:$m0)
+                      ds_offset:$offset, gds:$gds)
 >;
 
 multiclass DS_1A2D_NORET <bits<8> op, string opName, RegisterClass rc,
                           string noRetOp = opName,
   dag outs = (outs),
   dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1,
-             ds_offset:$offset, gds:$gds, M0Reg:$m0),
+             ds_offset:$offset, gds:$gds),
   string asm = opName#" $addr, $data0, $data1"#"$offset"#"$gds"> {
 
   def "" : DS_Pseudo <opName, outs, ins, []>,
@@ -1833,7 +1932,7 @@ multiclass DS_1A2D_NORET <bits<8> op, string opName, RegisterClass rc,
 
 multiclass DS_0A_RET <bits<8> op, string opName,
   dag outs = (outs VGPR_32:$vdst),
-  dag ins = (ins ds_offset:$offset, gds:$gds, M0Reg:$m0),
+  dag ins = (ins ds_offset:$offset, gds:$gds),
   string asm = opName#" $vdst"#"$offset"#"$gds"> {
 
   let mayLoad = 1, mayStore = 1 in {
@@ -1848,7 +1947,7 @@ multiclass DS_0A_RET <bits<8> op, string opName,
 
 multiclass DS_1A_RET_GDS <bits<8> op, string opName,
   dag outs = (outs VGPR_32:$vdst),
-  dag ins = (ins VGPR_32:$addr, ds_offset_gds:$offset, M0Reg:$m0),
+  dag ins = (ins VGPR_32:$addr, ds_offset_gds:$offset),
   string asm = opName#" $vdst, $addr"#"$offset gds"> {
 
   def "" : DS_Pseudo <opName, outs, ins, []>;
@@ -1861,7 +1960,7 @@ multiclass DS_1A_RET_GDS <bits<8> op, string opName,
 
 multiclass DS_1A_GDS <bits<8> op, string opName,
   dag outs = (outs),
-  dag ins = (ins VGPR_32:$addr, M0Reg:$m0),
+  dag ins = (ins VGPR_32:$addr),
   string asm = opName#" $addr gds"> {
 
   def "" : DS_Pseudo <opName, outs, ins, []>;
@@ -1874,7 +1973,7 @@ multiclass DS_1A_GDS <bits<8> op, string opName,
 
 multiclass DS_1A <bits<8> op, string opName,
   dag outs = (outs),
-  dag ins = (ins VGPR_32:$addr, ds_offset:$offset, M0Reg:$m0, gds:$gds),
+  dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds),
   string asm = opName#" $addr"#"$offset"#"$gds"> {
 
   let mayLoad = 1, mayStore = 1 in {
diff --git a/llvm/lib/Target/R600/SIInstructions.td b/llvm/lib/Target/R600/SIInstructions.td
index 7137baab..24ebd5c 100644
--- a/llvm/lib/Target/R600/SIInstructions.td
+++ b/llvm/lib/Target/R600/SIInstructions.td
@@ -2824,52 +2824,52 @@ def : ROTRPattern <V_ALIGNBIT_B32>;
 
 class DSReadPat <DS inst, ValueType vt, PatFrag frag> : Pat <
   (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))),
-  (inst $ptr, (as_i16imm $offset), (i1 0), (S_MOV_B32 -1))
+  (inst $ptr, (as_i16imm $offset), (i1 0))
 >;
 
-def : DSReadPat <DS_READ_I8,  i32, sextloadi8_local>;
-def : DSReadPat <DS_READ_U8,  i32, az_extloadi8_local>;
-def : DSReadPat <DS_READ_I16, i32, sextloadi16_local>;
-def : DSReadPat <DS_READ_U16, i32, az_extloadi16_local>;
-def : DSReadPat <DS_READ_B32, i32, local_load>;
+def : DSReadPat <DS_READ_I8,  i32, si_sextload_local_i8>;
+def : DSReadPat <DS_READ_U8,  i32, si_az_extload_local_i8>;
+def : DSReadPat <DS_READ_I16, i32, si_sextload_local_i16>;
+def : DSReadPat <DS_READ_U16, i32, si_az_extload_local_i16>;
+def : DSReadPat <DS_READ_B32, i32, si_load_local>;
 
 let AddedComplexity = 100 in {
 
-def : DSReadPat <DS_READ_B64, v2i32, local_load_aligned8bytes>;
+def : DSReadPat <DS_READ_B64, v2i32, si_load_local_align8>;
 
 } // End AddedComplexity = 100
 
 def : Pat <
-  (v2i32 (local_load (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
+  (v2i32 (si_load_local (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
                                     i8:$offset1))),
-  (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0), (S_MOV_B32 -1))
+  (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0))
 >;
 
 class DSWritePat <DS inst, ValueType vt, PatFrag frag> : Pat <
   (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)),
-  (inst $ptr, $value, (as_i16imm $offset), (i1 0), (S_MOV_B32 -1))
+  (inst $ptr, $value, (as_i16imm $offset), (i1 0))
 >;
 
-def : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>;
-def : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>;
-def : DSWritePat <DS_WRITE_B32, i32, local_store>;
+def : DSWritePat <DS_WRITE_B8, i32, si_truncstore_local_i8>;
+def : DSWritePat <DS_WRITE_B16, i32, si_truncstore_local_i16>;
+def : DSWritePat <DS_WRITE_B32, i32, si_store_local>;
 
 let AddedComplexity = 100 in {
 
-def : DSWritePat <DS_WRITE_B64, v2i32, local_store_aligned8bytes>;
+def : DSWritePat <DS_WRITE_B64, v2i32, si_store_local_align8>;
 
 } // End AddedComplexity = 100
 
 def : Pat <
-  (local_store v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
-                                                  i8:$offset1)),
+  (si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
+                                                     i8:$offset1)),
   (DS_WRITE2_B32 $ptr, (EXTRACT_SUBREG $value, sub0),
                        (EXTRACT_SUBREG $value, sub1), $offset0, $offset1,
-                       (i1 0), (S_MOV_B32 -1))
+                       (i1 0))
 >;
 
 class DSAtomicRetPat <DS inst, ValueType vt, PatFrag frag> : Pat <
   (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
-  (inst $ptr, $value, (as_i16imm $offset), (i1 0), (S_MOV_B32 -1))
+  (inst $ptr, $value, (as_i16imm $offset), (i1 0))
 >;
 
 // Special case of DSAtomicRetPat for add / sub 1 -> inc / dec
@@ -2885,53 +2885,53 @@ class DSAtomicRetPat <DS inst, ValueType vt, PatFrag frag> : Pat <
 class DSAtomicIncRetPat <DS inst, ValueType vt,
                          Instruction LoadImm, PatFrag frag> : Pat <
   (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)),
-  (inst $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (i1 0), (S_MOV_B32 -1))
+  (inst $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (i1 0))
 >;
 
 class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat <
   (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap),
-  (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0), (S_MOV_B32 -1))
+  (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0))
 >;
 
 
 // 32-bit atomics.
 def : DSAtomicIncRetPat<DS_INC_RTN_U32, i32,
-                        S_MOV_B32, atomic_load_add_local>;
+                        S_MOV_B32, si_atomic_load_add_local>;
 def : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32,
-                        S_MOV_B32, atomic_load_sub_local>;
+                        S_MOV_B32, si_atomic_load_sub_local>;
 
-def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, atomic_swap_local>;
-def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, atomic_load_add_local>;
-def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, atomic_load_sub_local>;
-def : DSAtomicRetPat<DS_AND_RTN_B32, i32, atomic_load_and_local>;
-def : DSAtomicRetPat<DS_OR_RTN_B32, i32, atomic_load_or_local>;
-def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, atomic_load_xor_local>;
-def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, atomic_load_min_local>;
-def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, atomic_load_max_local>;
-def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, atomic_load_umin_local>;
-def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, atomic_load_umax_local>;
+def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, si_atomic_swap_local>;
+def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, si_atomic_load_add_local>;
+def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, si_atomic_load_sub_local>;
+def : DSAtomicRetPat<DS_AND_RTN_B32, i32, si_atomic_load_and_local>;
+def : DSAtomicRetPat<DS_OR_RTN_B32, i32, si_atomic_load_or_local>;
+def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, si_atomic_load_xor_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, si_atomic_load_min_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, si_atomic_load_max_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, si_atomic_load_umin_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, si_atomic_load_umax_local>;
 
-def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, atomic_cmp_swap_32_local>;
+def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, si_atomic_cmp_swap_32_local>;
 
 // 64-bit atomics.
 def : DSAtomicIncRetPat<DS_INC_RTN_U64, i64,
-                        S_MOV_B64, atomic_load_add_local>;
+                        S_MOV_B64, si_atomic_load_add_local>;
 def : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64,
-                        S_MOV_B64, atomic_load_sub_local>;
-
-def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, atomic_swap_local>;
-def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, atomic_load_add_local>;
-def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, atomic_load_sub_local>;
-def : DSAtomicRetPat<DS_AND_RTN_B64, i64, atomic_load_and_local>;
-def : DSAtomicRetPat<DS_OR_RTN_B64, i64, atomic_load_or_local>;
-def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, atomic_load_xor_local>;
-def : DSAtomicRetPat<DS_MIN_RTN_I64, i64, atomic_load_min_local>;
-def : DSAtomicRetPat<DS_MAX_RTN_I64, i64, atomic_load_max_local>;
-def : DSAtomicRetPat<DS_MIN_RTN_U64, i64, atomic_load_umin_local>;
-def : DSAtomicRetPat<DS_MAX_RTN_U64, i64, atomic_load_umax_local>;
-
-def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, atomic_cmp_swap_64_local>;
+                        S_MOV_B64, si_atomic_load_sub_local>;
+
+def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, si_atomic_swap_local>;
+def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, si_atomic_load_add_local>;
+def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, si_atomic_load_sub_local>;
+def : DSAtomicRetPat<DS_AND_RTN_B64, i64, si_atomic_load_and_local>;
+def : DSAtomicRetPat<DS_OR_RTN_B64, i64, si_atomic_load_or_local>;
+def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, si_atomic_load_xor_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_I64, i64, si_atomic_load_min_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_I64, i64, si_atomic_load_max_local>;
+def : DSAtomicRetPat<DS_MIN_RTN_U64, i64, si_atomic_load_umin_local>;
+def : DSAtomicRetPat<DS_MAX_RTN_U64, i64, si_atomic_load_umax_local>;
+
+def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, si_atomic_cmp_swap_64_local>;
 
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/R600/SILoadStoreOptimizer.cpp b/llvm/lib/Target/R600/SILoadStoreOptimizer.cpp
index a927ad8..9b1d256 100644
--- a/llvm/lib/Target/R600/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/R600/SILoadStoreOptimizer.cpp
@@ -213,7 +213,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
   // Be careful, since the addresses could be subregisters themselves in weird
   // cases, like vectors of pointers.
   const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
-  const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0);
 
   unsigned DestReg0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst)->getReg();
   unsigned DestReg1
@@ -254,37 +253,24 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
     .addImm(NewOffset0) // offset0
     .addImm(NewOffset1) // offset1
     .addImm(0) // gds
-    .addOperand(*M0Reg) // M0
     .addMemOperand(*I->memoperands_begin())
     .addMemOperand(*Paired->memoperands_begin());
 
-  LIS->InsertMachineInstrInMaps(Read2);
-
   unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
   unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
   updateRegDefsUses(DestReg0, DestReg, SubRegIdx0);
   updateRegDefsUses(DestReg1, DestReg, SubRegIdx1);
 
   LIS->RemoveMachineInstrFromMaps(I);
-  LIS->RemoveMachineInstrFromMaps(Paired);
+  // Replacing Paired in the maps with Read2 allows us to avoid updating the
+  // live range for the m0 register.
+  LIS->ReplaceMachineInstrInMaps(Paired, Read2);
   I->eraseFromParent();
   Paired->eraseFromParent();
 
   LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
   LIS->shrinkToUses(&AddrRegLI);
 
-  LiveInterval &M0RegLI = LIS->getInterval(M0Reg->getReg());
-  LIS->shrinkToUses(&M0RegLI);
-
-  // Currently m0 is treated as a register class with one member instead of an
-  // implicit physical register. We are using the virtual register for the first
-  // one, but we still need to update the live range of the now unused second m0
-  // virtual register to avoid verifier errors.
-  const MachineOperand *PairedM0Reg
-    = TII->getNamedOperand(*Paired, AMDGPU::OpName::m0);
-  LiveInterval &PairedM0RegLI = LIS->getInterval(PairedM0Reg->getReg());
-  LIS->shrinkToUses(&PairedM0RegLI);
-
   LIS->getInterval(DestReg); // Create new LI
 
   DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
@@ -300,7 +286,6 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
   // Be sure to use .addOperand(), and not .addReg() with these. We want to be
   // sure we preserve the subregister index and any register flags set on them.
   const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
-  const MachineOperand *M0Reg = TII->getNamedOperand(*I, AMDGPU::OpName::m0);
   const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0);
   const MachineOperand *Data1
     = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0);
@@ -331,6 +316,13 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
   const MCInstrDesc &Write2Desc = TII->get(Opc);
   DebugLoc DL = I->getDebugLoc();
 
+  // repairLiveintervalsInRange() doesn't handle physical register, so we have
+  // to update the M0 range manually.
+  SlotIndex PairedIndex = LIS->getInstructionIndex(Paired);
+  LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
+  LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
+  bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
+
   MachineInstrBuilder Write2
     = BuildMI(*MBB, I, DL, Write2Desc)
     .addOperand(*Addr) // addr
@@ -339,21 +331,25 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
     .addImm(NewOffset0) // offset0
     .addImm(NewOffset1) // offset1
     .addImm(0) // gds
-    .addOperand(*M0Reg) // m0
     .addMemOperand(*I->memoperands_begin())
     .addMemOperand(*Paired->memoperands_begin());
 
   // XXX - How do we express subregisters here?
-  unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg(),
-                          M0Reg->getReg()};
+  unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() };
 
   LIS->RemoveMachineInstrFromMaps(I);
   LIS->RemoveMachineInstrFromMaps(Paired);
   I->eraseFromParent();
   Paired->eraseFromParent();
 
+  // This doesn't handle physical registers like M0
   LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs);
 
+  if (UpdateM0Range) {
+    SlotIndex Write2Index = LIS->getInstructionIndex(Write2);
+    M0Segment->end = Write2Index.getRegSlot();
+  }
+
   DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
   return Write2.getInstr();
 }
diff --git a/llvm/test/CodeGen/R600/ds_read2st64.ll b/llvm/test/CodeGen/R600/ds_read2st64.ll
index 54b3b45..b72e3ef 100644
--- a/llvm/test/CodeGen/R600/ds_read2st64.ll
+++ b/llvm/test/CodeGen/R600/ds_read2st64.ll
@@ -65,8 +65,8 @@ define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float add
 
 ; SI-LABEL: @simple_read2st64_f32_over_max_offset
 ; SI-NOT: ds_read2st64_b32
-; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}}
 ; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
+; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], 0x10000, {{v[0-9]+}}
 ; SI: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]
 ; SI: s_endpgm
 define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
diff --git a/llvm/test/CodeGen/R600/shl_add_ptr.ll b/llvm/test/CodeGen/R600/shl_add_ptr.ll
index 066dafb..6671e90 100644
--- a/llvm/test/CodeGen/R600/shl_add_ptr.ll
+++ b/llvm/test/CodeGen/R600/shl_add_ptr.ll
@@ -69,8 +69,8 @@ define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)
 ; pointer can be used with an offset into the second one.
 
 ; SI-LABEL: {{^}}load_shl_base_lds_2:
+; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: s_mov_b32 m0, -1
-; SI-NEXT: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
 ; SI: s_endpgm
 define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
-- 
2.7.4
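P.S. Review note on the mergeWrite2Pair() change above: the subtle part is
the manual live-range update, because repairIntervalsInRange() only
repairs virtual registers and m0 is now a physical-register use
(mergeRead2Pair sidesteps the same problem by replacing Paired with Read2
in the slot index maps).  A condensed recap of what the hunk does, using
the same LiveIntervals API as the patch; the null check on the segment is
an extra safety added in this sketch only:

    // If the live segment of M0's register unit ended exactly at the
    // second (now deleted) write, stretch it to cover the merged
    // ds_write2 so the machine verifier still sees m0 live at its use.
    SlotIndex PairedIdx = LIS->getInstructionIndex(Paired);
    LiveRange &M0Unit = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
    if (LiveRange::Segment *Seg = M0Unit.getSegmentContaining(PairedIdx))
      if (Seg->end == PairedIdx.getRegSlot())
        Seg->end = LIS->getInstructionIndex(Write2).getRegSlot();

The new scheduling freedom is also why the two test updates reorder their
CHECK lines: s_mov_b32 m0, -1 is now glued to the DS instruction itself,
so an independent VALU instruction such as the v_lshlrev_b32 can be
scheduled ahead of it.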