From 2232243863bc5f3f3632a9adeab9bf3293543d42 Mon Sep 17 00:00:00 2001 From: Marek Olsak Date: Thu, 26 Oct 2017 14:43:02 +0000 Subject: [PATCH] AMDGPU: Handle s_buffer_load_dword hazard on SI Reviewers: arsenm, nhaehnle Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye Differential Revision: https://reviews.llvm.org/D39171 llvm-svn: 316666 --- llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 27 ++++++++++++++++++++++++++ llvm/test/CodeGen/AMDGPU/smrd.ll | 17 ++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 76c3ed7..3f54f8b 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -335,6 +335,18 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { // SGPR was written by a VALU instruction. int SmrdSgprWaitStates = 4; auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); }; + auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); }; + + bool IsBufferSMRD = SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORD_IMM || + SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM || + SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM || + SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM || + SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORDX16_IMM || + SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORD_SGPR || + SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR || + SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR || + SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR || + SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR; for (const MachineOperand &Use : SMRD->uses()) { if (!Use.isReg()) @@ -342,7 +354,22 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { int WaitStatesNeededForUse = SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn); WaitStatesNeeded = 
std::max(WaitStatesNeeded, WaitStatesNeededForUse); + + // This fixes what appears to be undocumented hardware behavior in SI where + // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor + // need some number of nops in between. We don't know how many we need, but + // let's use 4. This wasn't discovered before probably because the only + // case when this happens is when we expand a 64-bit pointer into a full + // descriptor and use s_buffer_load_dword instead of s_load_dword, which was + // probably never encountered in the closed-source land. + if (IsBufferSMRD) { + int WaitStatesNeededForUse = + SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), + IsBufferHazardDefFn); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + } } + return WaitStatesNeeded; } diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll index 3f1e1ca..a19768d 100644 --- a/llvm/test/CodeGen/AMDGPU/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd.ll @@ -84,6 +84,23 @@ entry: ret void } +; GCN-LABEL: {{^}}smrd_hazard: +; GCN-DAG: s_mov_b32 s3, 3 +; GCN-DAG: s_mov_b32 s2, 2 +; GCN-DAG: s_mov_b32 s1, 1 +; GCN-DAG: s_mov_b32 s0, 0 +; SI-NEXT: nop 3 +; GCN-NEXT: s_buffer_load_dword s0, s[0:3], 0x0 +define amdgpu_ps float @smrd_hazard(<4 x i32> inreg %desc) #0 { +main_body: + %d0 = insertelement <4 x i32> undef, i32 0, i32 0 + %d1 = insertelement <4 x i32> %d0, i32 1, i32 1 + %d2 = insertelement <4 x i32> %d1, i32 2, i32 2 + %d3 = insertelement <4 x i32> %d2, i32 3, i32 3 + %r = call float @llvm.SI.load.const.v4i32(<4 x i32> %d3, i32 0) + ret float %r +} + ; SMRD load using the load.const.v4i32 intrinsic with an immediate offset ; GCN-LABEL: {{^}}smrd_load_const0: ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04 -- 2.7.4