From 26e76ef0e2cf358809d2b41e657074fc21133d59 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Fri, 8 Mar 2019 20:46:15 +0000
Subject: [PATCH] DAG: Don't try to cluster loads with tied inputs

This avoids breaking possible value dependencies when sorting loads by
offset.

AMDGPU has some load instructions that write into the high or low bits
of the destination register and have a tied input for the other bits.
These can easily have the same base pointer but be a swizzle, so the
load at the higher address needs to come first. Clustering was
inserting glue that forced the opposite ordering, producing a cycle the
InstrEmitter would assert on. Since it may be expensive to look for the
dependency between the other loads, just skip any load where this could
happen.

Fixes bug 40936 by reverting r351379, which added a hacky attempt to
fix this by adding chains in this case; I think that was just working
around broken glue handling before the InstrEmitter. The core of this
patch re-implements the fix for that problem.

llvm-svn: 355728
---
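
A quick reference for reviewers: the new check is built on MCInstrDesc
operand constraints. Below is a minimal, self-contained sketch of the
same query (the free-function form and its explicit TII parameter are
illustrative only, not part of the patch; the patch itself uses a
lambda capturing 'this'):

    #include "llvm/CodeGen/SelectionDAGNodes.h"
    #include "llvm/CodeGen/TargetInstrInfo.h"
    #include "llvm/MC/MCInstrDesc.h"

    using namespace llvm;

    // Returns true if any operand of the machine node is tied to another
    // operand, i.e. the instruction also reads part of the register it
    // defines, as the AMDGPU *_D16_HI loads do.
    static bool hasTiedInput(const TargetInstrInfo *TII, const SDNode *N) {
      const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
      for (unsigned I = 0, E = MCID.getNumOperands(); I != E; ++I) {
        // getOperandConstraint returns the index of the operand this one
        // is tied to, or -1 if operand I has no TIED_TO constraint.
        if (MCID.getOperandConstraint(I, MCOI::TIED_TO) != -1)
          return true;
      }
      return false;
    }

ClusterNeighboringLoads is only invoked for machine nodes, so calling
getMachineOpcode() there is safe; a standalone caller would want to
check N->isMachineOpcode() first, since getMachineOpcode() asserts on
non-machine nodes. Skipping these loads entirely is a deliberately
cheap alternative to analyzing the dependency between the other loads.
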
 .../CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp    | 21 +++++++++-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp          | 45 ----------------------
 llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll         | 38 +++++++++++++++++-
 3 files changed, 57 insertions(+), 47 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 559265a..93c5c34 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -205,6 +205,19 @@ void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) {
   if (!Chain)
     return;
 
+  // Skip any load instruction that has a tied input. There may be an additional
+  // dependency requiring a different order than by increasing offsets, and the
+  // added glue may introduce a cycle.
+  auto hasTiedInput = [this](const SDNode *N) {
+    const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
+    for (unsigned I = 0; I != MCID.getNumOperands(); ++I) {
+      if (MCID.getOperandConstraint(I, MCOI::TIED_TO) != -1)
+        return true;
+    }
+
+    return false;
+  };
+
   // Look for other loads of the same chain. Find loads that are loading from
   // the same base pointer and different offsets.
   SmallPtrSet<SDNode*, 16> Visited;
@@ -212,6 +225,10 @@ void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) {
   DenseMap<long long, SDNode*> O2SMap;  // Map from offset to SDNode.
   bool Cluster = false;
   SDNode *Base = Node;
+
+  if (hasTiedInput(Base))
+    return;
+
   // This algorithm requires a reasonably low use count before finding a match
   // to avoid uselessly blowing up compile time in large blocks.
   unsigned UseCount = 0;
@@ -222,10 +239,12 @@ void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) {
       continue;
     int64_t Offset1, Offset2;
     if (!TII->areLoadsFromSameBasePtr(Base, User, Offset1, Offset2) ||
-        Offset1 == Offset2)
+        Offset1 == Offset2 ||
+        hasTiedInput(User)) {
       // FIXME: Should be ok if they addresses are identical. But earlier
       // optimizations really should have eliminated one of the loads.
       continue;
+    }
     if (O2SMap.insert(std::make_pair(Offset1, Base)).second)
       Offsets.push_back(Offset1);
     O2SMap.insert(std::make_pair(Offset2, User));
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3873c46..69e05c3 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9367,51 +9367,6 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
     Ops.push_back(ImpDef.getValue(1));
     return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
   }
-  case AMDGPU::FLAT_LOAD_UBYTE_D16_HI:
-  case AMDGPU::FLAT_LOAD_SBYTE_D16_HI:
-  case AMDGPU::FLAT_LOAD_SHORT_D16_HI:
-  case AMDGPU::GLOBAL_LOAD_UBYTE_D16_HI:
-  case AMDGPU::GLOBAL_LOAD_SBYTE_D16_HI:
-  case AMDGPU::GLOBAL_LOAD_SHORT_D16_HI:
-  case AMDGPU::DS_READ_U16_D16_HI:
-  case AMDGPU::DS_READ_I8_D16_HI:
-  case AMDGPU::DS_READ_U8_D16_HI:
-  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
-  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
-  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
-  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
-  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
-  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN: {
-    // For these loads that write to the HI part of a register,
-    // we should chain them to the op that writes to the LO part
-    // of the register to maintain the order.
-    unsigned NumOps = Node->getNumOperands();
-    SDValue OldChain = Node->getOperand(NumOps-1);
-
-    if (OldChain.getValueType() != MVT::Other)
-      break;
-
-    // Look for the chain to replace to.
-    SDValue Lo = Node->getOperand(NumOps-2);
-    SDNode *LoNode = Lo.getNode();
-    if (LoNode->getNumValues() == 1 ||
-        LoNode->getValueType(LoNode->getNumValues() - 1) != MVT::Other)
-      break;
-
-    SDValue NewChain = Lo.getValue(LoNode->getNumValues() - 1);
-    if (NewChain == OldChain) // Already replaced.
-      break;
-
-    SmallVector<SDValue, 8> Ops;
-    for (unsigned I = 0; I < NumOps-1; ++I)
-      Ops.push_back(Node->getOperand(I));
-    // Repalce the Chain.
-    Ops.push_back(NewChain);
-    MachineSDNode *NewNode = DAG.getMachineNode(Opcode, SDLoc(Node),
-                                                Node->getVTList(), Ops);
-    DAG.setNodeMemRefs(NewNode, Node->memoperands());
-    return NewNode;
-  }
   default:
     break;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index 320d008..696b33e 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}chain_hi_to_lo_private:
 ; GCN: buffer_load_ushort [[DST:v[0-9]+]], off, [[RSRC:s\[[0-9]+:[0-9]+\]]], [[SOFF:s[0-9]+]] offset:2
@@ -139,3 +139,39 @@ bb:
 
   ret <2 x half> %result
 }
+
+; Make sure we don't lose any of the private stores.
+; GCN-LABEL: {{^}}vload2_private:
+; GCN: buffer_store_short v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:4
+; GCN: buffer_store_short_d16_hi v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:6
+; GCN: buffer_store_short v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:8
+
+; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:4
+; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:6
+; GCN: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:8
+define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %in, <2 x i16> addrspace(1)* nocapture %out) #0 {
+entry:
+  %loc = alloca [3 x i16], align 2, addrspace(5)
+  %loc.0.sroa_cast1 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)*
+  %tmp = load i16, i16 addrspace(1)* %in, align 2
+  %loc.0.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 0
+  store volatile i16 %tmp, i16 addrspace(5)* %loc.0.sroa_idx
+  %arrayidx.1 = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
+  %tmp1 = load i16, i16 addrspace(1)* %arrayidx.1, align 2
+  %loc.2.sroa_idx3 = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 1
+  store volatile i16 %tmp1, i16 addrspace(5)* %loc.2.sroa_idx3
+  %arrayidx.2 = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 2
+  %tmp2 = load i16, i16 addrspace(1)* %arrayidx.2, align 2
+  %loc.4.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 2
+  store volatile i16 %tmp2, i16 addrspace(5)* %loc.4.sroa_idx
+  %loc.0.sroa_cast = bitcast [3 x i16] addrspace(5)* %loc to <2 x i16> addrspace(5)*
+  %loc.0. = load <2 x i16>, <2 x i16> addrspace(5)* %loc.0.sroa_cast, align 2
+  store <2 x i16> %loc.0., <2 x i16> addrspace(1)* %out, align 4
+  %loc.2.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 1
+  %loc.2.sroa_cast = bitcast i16 addrspace(5)* %loc.2.sroa_idx to <2 x i16> addrspace(5)*
+  %loc.2. = load <2 x i16>, <2 x i16> addrspace(5)* %loc.2.sroa_cast, align 2
+  %arrayidx6 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 1
+  store <2 x i16> %loc.2., <2 x i16> addrspace(1)* %arrayidx6, align 4
+  %loc.0.sroa_cast2 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)*
+  ret void
+}
-- 
2.7.4