[AMDGPU] need to insert wait between the scalar load and vector store to the same...
authoralex-t <alexander.timofeev@amd.com>
Sat, 4 Jan 2020 15:23:14 +0000 (18:23 +0300)
committeralex-t <alexander.timofeev@amd.com>
Sat, 4 Jan 2020 15:23:14 +0000 (18:23 +0300)
Reviewers: rampitec, vpykhtin, nhaehnle

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D71934

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll [new file with mode: 0644]

index 927826c..ef662d5 100644 (file)
@@ -42,7 +42,9 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
@@ -372,6 +374,8 @@ private:
   AMDGPU::IsaVersion IV;
 
   DenseSet<MachineInstr *> TrackedWaitcntSet;
+  DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
+  MachinePostDominatorTree *PDT;
 
   struct BlockInfo {
     MachineBasicBlock *MBB;
@@ -406,6 +410,7 @@ public:
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addRequired<MachinePostDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
@@ -792,6 +797,7 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
 
 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                       false)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                     false)
 
@@ -1012,6 +1018,13 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
       if (MI.mayStore()) {
         // FIXME: Should not be relying on memoperands.
         for (const MachineMemOperand *Memop : MI.memoperands()) {
+          const Value *Ptr = Memop->getValue();
+          if (SLoadAddresses.count(Ptr)) {
+            addWait(Wait, LGKM_CNT, 0);
+            if (PDT->dominates(MI.getParent(),
+                               SLoadAddresses.find(Ptr)->second))
+              SLoadAddresses.erase(Ptr);
+          }
           unsigned AS = Memop->getAddrSpace();
           if (AS != AMDGPUAS::LOCAL_ADDRESS)
             continue;
@@ -1399,6 +1412,13 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
       }
     }
 
+    if (TII->isSMRD(Inst)) {
+      for (const MachineMemOperand *Memop : Inst.memoperands()) {
+        const Value *Ptr = Memop->getValue();
+        SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
+      }
+    }
+
     // Generate an s_waitcnt instruction to be placed before
     // cur_Inst, if needed.
     Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
@@ -1448,6 +1468,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
   MRI = &MF.getRegInfo();
   IV = AMDGPU::getIsaVersion(ST->getCPU());
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  PDT = &getAnalysis<MachinePostDominatorTree>();
 
   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
   for (auto T : inst_counter_types())
diff --git a/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll b/llvm/test/CodeGen/AMDGPU/smrd_vmem_war.ll
new file mode 100644 (file)
index 0000000..4ba16b4
--- /dev/null
@@ -0,0 +1,29 @@
+; RUN: llc  -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
+
+; GCN-LABEL: BB0_1
+; GCN: s_load_dword s{{[0-9]+}}, s{{\[}}[[ADDR_LO:[0-9]+]]{{\:}}[[ADDR_HI:[0-9]+]]{{\]}}, 0x0
+; GCN: s_waitcnt lgkmcnt(0)
+; GCN: global_store_dword v{{\[}}[[ADDR_LO]]{{\:}}[[ADDR_HI]]{{\]}}, v{{[0-9]+}}, off
+
+define amdgpu_kernel void @zot(i32 addrspace(1)* nocapture %arg, i64 addrspace(1)* nocapture %arg1) {
+bb:
+  %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp2 = icmp eq i32 %tmp, 0
+  br i1 %tmp2, label %bb3, label %bb8
+
+bb3:                                              ; preds = %bb
+  %tmp4 = load i32, i32 addrspace(1)* %arg, align 4
+  store i32 0, i32 addrspace(1)* %arg, align 4
+  %tmp5 = zext i32 %tmp4 to i64
+  %tmp6 = load i64, i64 addrspace(1)* %arg1, align 8
+  %tmp7 = add i64 %tmp6, %tmp5
+  store i64 %tmp7, i64 addrspace(1)* %arg1, align 8
+  br label %bb8
+
+bb8:                                              ; preds = %bb3, %bb
+  ret void
+}
+; Function Attrs: nounwind readnone speculatable
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone speculatable }