Fixes various bugs with reordering loads and stores.
Scalarized vector loads weren't collecting the chains
of the component loads at all, so later memory operations
could be reordered past them.
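
In SelectionDAG, every memory operation also produces a chain result, and a
lowered load that drops its chain lets later stores be scheduled past it.
A minimal sketch of the invariant this patch restores (the helper name and
parameters here are illustrative, not part of the patch):

    // Illustrative only: a custom-lowered load must hand back both results.
    static SDValue lowerLoadSketch(SDValue NewLoad, SDValue Value, SDLoc DL,
                                   SelectionDAG &DAG) {
      SDValue Ops[] = {
        Value,               // value 0: the data the load's users consume
        NewLoad.getValue(1)  // value 1: the chain of the real memory op
      };
      return DAG.getMergeValues(Ops, DL); // one node with two results
    }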
llvm-svn: 212473
SelectionDAG &DAG) const {
LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
EVT MemEltVT = Load->getMemoryVT().getVectorElementType();
+ EVT LoadVT = Op.getValueType();
EVT EltVT = Op.getValueType().getVectorElementType();
EVT PtrVT = Load->getBasePtr().getValueType();
+
unsigned NumElts = Load->getMemoryVT().getVectorNumElements();
SmallVector<SDValue, 8> Loads;
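+  // Collect the chain result of each component load; previously these were
+  // dropped, so nothing ordered later memory operations after the pieces.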
+ SmallVector<SDValue, 8> Chains;
+
SDLoc SL(Op);
for (unsigned i = 0, e = NumElts; i != e; ++i) {
SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(),
DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), PtrVT));
- Loads.push_back(DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
- Load->getChain(), Ptr,
- MachinePointerInfo(Load->getMemOperand()->getValue()),
- MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
- Load->getAlignment()));
+
+ SDValue NewLoad
+ = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
+ Load->getChain(), Ptr,
+ MachinePointerInfo(Load->getMemOperand()->getValue()),
+ MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
+ Load->getAlignment());
+ Loads.push_back(NewLoad.getValue(0));
+ Chains.push_back(NewLoad.getValue(1));
}
- return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), Loads);
+
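+  // Merge the component chains with a TokenFactor so users of the original
+  // load's chain are ordered after all of the scalarized loads.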
+ SDValue Ops[] = {
+ DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads),
+ DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains)
+ };
+
+ return DAG.getMergeValues(Ops, SL);
}
SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
Load->getBasePtr(),
MemVT,
Load->getMemOperand());
- return DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32);
+
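+  // Return the chain of the widened load along with the extended value so
+  // the caller's chain users stay attached to the real memory operation.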
+ SDValue Ops[] = {
+ DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32),
+ ExtLoad32.getValue(1)
+ };
+
+ return DAG.getMergeValues(Ops, DL);
}
if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
BasePtr, MVT::i8, MMO);
- return DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD);
+
+ SDValue Ops[] = {
+ DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD),
+ NewLD.getValue(1)
+ };
+
+ return DAG.getMergeValues(Ops, DL);
}
// Lower constant address space loads from global variables.
isa<GlobalVariable>(
GetUnderlyingObject(Load->getMemOperand()->getValue()))) {
+
SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL,
getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
DAG.getConstant(2, MVT::i32));
- return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+ return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
Load->getChain(), Ptr,
DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
}
EVT MemEltVT = MemVT.getScalarType();
if (ExtType == ISD::SEXTLOAD) {
SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
- return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode);
+
+ SDValue Ops[] = {
+ DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
+ Load->getChain()
+ };
+
+ return DAG.getMergeValues(Ops, DL);
}
- return DAG.getZeroExtendInReg(Ret, DL, MemEltVT);
+ SDValue Ops[] = {
+ DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
+ Load->getChain()
+ };
+
+ return DAG.getMergeValues(Ops, DL);
}
SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
case ISD::FSIN: return LowerTrig(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
- case ISD::LOAD: return LowerLOAD(Op, DAG);
+ case ISD::LOAD: {
+ SDValue Result = LowerLOAD(Op, DAG);
+ assert((!Result.getNode() ||
+ Result.getNode()->getNumValues() == 2) &&
+ "Load should return a value and a chain");
+ return Result;
+ }
+
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
case ISD::INTRINSIC_VOID: {
Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
(Load->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
Op.getValueType().getVectorNumElements() > 4))) {
- SDValue MergedValues[2] = {
- SplitVectorLoad(Op, DAG),
- Load->getChain()
- };
- return DAG.getMergeValues(MergedValues, SDLoc(Op));
+ return SplitVectorLoad(Op, DAG);
} else {
- return LowerLOAD(Op, DAG);
+ SDValue Result = LowerLOAD(Op, DAG);
+ assert((!Result.getNode() ||
+ Result.getNode()->getNumValues() == 2) &&
+ "Load should return a value and a chain");
+ return Result;
}
}
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
LoadSDNode *Load = cast<LoadSDNode>(Op);
- SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
- SDValue MergedValues[2];
- MergedValues[1] = Load->getChain();
- if (Ret.getNode()) {
- MergedValues[0] = Ret;
- return DAG.getMergeValues(MergedValues, DL);
- }
+ SDValue Lowered = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
+ if (Lowered.getNode())
+ return Lowered;
if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
return SDValue();
SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
DAG.getConstant(2, MVT::i32));
- Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
- Load->getChain(), Ptr,
- DAG.getTargetConstant(0, MVT::i32),
- Op.getOperand(2));
+
+ // FIXME: REGISTER_LOAD should probably have a chain result.
+ SDValue Chain = Load->getChain();
+ SDValue LoLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
+ Chain, Ptr,
+ DAG.getTargetConstant(0, MVT::i32),
+ Op.getOperand(2));
+
+ SDValue Ret = LoLoad.getValue(0);
if (MemVT.getSizeInBits() == 64) {
+ // TODO: This needs a test to make sure the right thing is happening with
+ // the chain. That is hard without general function support.
+
SDValue IncPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
DAG.getConstant(1, MVT::i32));
- SDValue LoadUpper = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
- Load->getChain(), IncPtr,
- DAG.getTargetConstant(0, MVT::i32),
- Op.getOperand(2));
+ SDValue HiLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
+ Chain, IncPtr,
+ DAG.getTargetConstant(0, MVT::i32),
+ Op.getOperand(2));
- Ret = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ret, LoadUpper);
+ Ret = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, LoLoad, HiLoad);
+ // Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ // LoLoad.getValue(1), HiLoad.getValue(1));
}
- MergedValues[0] = Ret;
- return DAG.getMergeValues(MergedValues, DL);
+ SDValue Ops[] = {
+ Ret,
+ Chain
+ };
+ return DAG.getMergeValues(Ops, DL);
}
SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
--- /dev/null
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
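+
+; Check that the chains created while lowering loads keep each store
+; ordered after the loads of the same (possibly aliasing) pointers.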
+
+; SI-LABEL: @no_reorder_v2f64_global_load_store
+; SI: BUFFER_LOAD_DWORDX2
+; SI: BUFFER_LOAD_DWORDX2
+; SI: BUFFER_LOAD_DWORDX2
+; SI: BUFFER_LOAD_DWORDX2
+; SI: BUFFER_STORE_DWORDX2
+; SI: BUFFER_STORE_DWORDX2
+; SI: BUFFER_STORE_DWORDX2
+; SI: BUFFER_STORE_DWORDX2
+; SI: S_ENDPGM
+define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind {
+ %tmp1 = load <2 x double> addrspace(1)* %x, align 16
+ %tmp4 = load <2 x double> addrspace(1)* %y, align 16
+ store <2 x double> %tmp4, <2 x double> addrspace(1)* %x, align 16
+ store <2 x double> %tmp1, <2 x double> addrspace(1)* %y, align 16
+ ret void
+}
+
+; SI-LABEL: @no_reorder_scalarized_v2f64_local_load_store
+; SI: DS_READ_B64
+; SI: DS_READ_B64
+; SI: DS_WRITE_B64
+; SI: DS_WRITE_B64
+; SI: S_ENDPGM
+define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind {
+ %tmp1 = load <2 x double> addrspace(3)* %x, align 16
+ %tmp4 = load <2 x double> addrspace(3)* %y, align 16
+ store <2 x double> %tmp4, <2 x double> addrspace(3)* %x, align 16
+ store <2 x double> %tmp1, <2 x double> addrspace(3)* %y, align 16
+ ret void
+}
+
+; SI-LABEL: @no_reorder_split_v8i32_global_load_store
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+; SI: BUFFER_LOAD_DWORD
+
+
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: BUFFER_STORE_DWORD
+; SI: S_ENDPGM
+define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind {
+ %tmp1 = load <8 x i32> addrspace(1)* %x, align 32
+ %tmp4 = load <8 x i32> addrspace(1)* %y, align 32
+ store <8 x i32> %tmp4, <8 x i32> addrspace(1)* %x, align 32
+ store <8 x i32> %tmp1, <8 x i32> addrspace(1)* %y, align 32
+ ret void
+}
+
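+; The extloads here are custom lowered; both reads must stay ahead of the
+; writes, and the SI-NOT line catches a load sinking between the stores.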
+; SI-LABEL: @no_reorder_extload_64
+; SI: DS_READ_B64
+; SI: DS_READ_B64
+; SI: DS_WRITE_B64
+; SI-NOT: DS_READ
+; SI: DS_WRITE_B64
+; SI: S_ENDPGM
+define void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind {
+ %tmp1 = load <2 x i32> addrspace(3)* %x, align 8
+ %tmp4 = load <2 x i32> addrspace(3)* %y, align 8
+ %tmp1ext = zext <2 x i32> %tmp1 to <2 x i64>
+ %tmp4ext = zext <2 x i32> %tmp4 to <2 x i64>
+ %tmp7 = add <2 x i64> %tmp1ext, <i64 1, i64 1>
+ %tmp9 = add <2 x i64> %tmp4ext, <i64 1, i64 1>
+ %trunctmp9 = trunc <2 x i64> %tmp9 to <2 x i32>
+ %trunctmp7 = trunc <2 x i64> %tmp7 to <2 x i32>
+ store <2 x i32> %trunctmp9, <2 x i32> addrspace(3)* %x, align 8
+ store <2 x i32> %trunctmp7, <2 x i32> addrspace(3)* %y, align 8
+ ret void
+}