[AMDGPU] Don't handle export done when unify exit nodes

author Ruiling Song <ruiling.song@amd.com>

Thu, 8 Jul 2021 01:42:06 +0000 (09:42 +0800)

committer Ruiling Song <ruiling.song@amd.com>

Wed, 14 Jul 2021 06:54:37 +0000 (14:54 +0800)
author Ruiling Song <ruiling.song@amd.com>
Thu, 8 Jul 2021 01:42:06 +0000 (09:42 +0800)
committer Ruiling Song <ruiling.song@amd.com>
Wed, 14 Jul 2021 06:54:37 +0000 (14:54 +0800)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp

index b581526..4e3d5fd 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -68,7 +68,7 @@ public:
    void getAnalysisUsage(AnalysisUsage &AU) const override;
    BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
                                    ArrayRef<BasicBlock *> ReturningBlocks,
-                                  bool InsertExport, StringRef Name);
+                                  StringRef Name);
    bool runOnFunction(Function &F) override;
  };
  
@@ -133,47 +133,15 @@ static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
    return true;
  }
  
-static void removeDoneExport(Function &F) {
-  ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
-  for (BasicBlock &BB : F) {
-    for (Instruction &I : BB) {
-      if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
-        if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
-          Intrin->setArgOperand(6, BoolFalse); // done
-        } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
-          Intrin->setArgOperand(4, BoolFalse); // done
-        }
-      }
-    }
-  }
-}
-
  BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet(
      Function &F, DomTreeUpdater &DTU, ArrayRef<BasicBlock *> ReturningBlocks,
-    bool InsertExport, StringRef Name) {
+    StringRef Name) {
    // Otherwise, we need to insert a new basic block into the function, add a PHI
    // nodes (if the function returns values), and convert all of the return
    // instructions into unconditional branches.
    BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
    IRBuilder<> B(NewRetBlock);
  
-  if (InsertExport) {
-    // Ensure that there's only one "done" export in the shader by removing the
-    // "done" bit set on the original final export. More than one "done" export
-    // can lead to undefined behavior.
-    removeDoneExport(F);
-
-    Value *Undef = UndefValue::get(B.getFloatTy());
-    B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
-                      {
-                        B.getInt32(AMDGPU::Exp::ET_NULL),
-                        B.getInt32(0), // enabled channels
-                        Undef, Undef, Undef, Undef, // values
-                        B.getTrue(), // done
-                        B.getTrue(), // valid mask
-                      });
-  }
-
    PHINode *PN = nullptr;
    if (F.getReturnType()->isVoidTy()) {
      B.CreateRetVoid();
@@ -181,7 +149,6 @@ BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet(
      // If the function doesn't return void... add a PHI node to the block...
      PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
                       "UnifiedRetVal");
-    assert(!InsertExport);
      B.CreateRet(PN);
    }
  
@@ -221,10 +188,8 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
  
    auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
  
-  // If there's only one exit, we don't need to do anything, unless this is a
-  // pixel shader and that exit is an infinite loop, since we still have to
-  // insert an export in that case.
-  if (PDT.root_size() <= 1 && F.getCallingConv() != CallingConv::AMDGPU_PS)
+  // If there's only one exit, we don't need to do anything.
+  if (PDT.root_size() <= 1)
      return false;
  
    LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>();
@@ -233,14 +198,11 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
    // Loop over all of the blocks in a function, tracking all of the blocks that
    // return.
    SmallVector<BasicBlock *, 4> ReturningBlocks;
-  SmallVector<BasicBlock *, 4> UniformlyReachedRetBlocks;
    SmallVector<BasicBlock *, 4> UnreachableBlocks;
  
    // Dummy return block for infinite loop.
    BasicBlock *DummyReturnBB = nullptr;
  
-  bool InsertExport = false;
-
    bool Changed = false;
    std::vector<DominatorTree::UpdateType> Updates;
  
@@ -248,8 +210,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
      if (isa<ReturnInst>(BB->getTerminator())) {
        if (!isUniformlyReached(DA, *BB))
          ReturningBlocks.push_back(BB);
-      else
-        UniformlyReachedRetBlocks.push_back(BB);
      } else if (isa<UnreachableInst>(BB->getTerminator())) {
        if (!isUniformlyReached(DA, *BB))
          UnreachableBlocks.push_back(BB);
@@ -261,36 +221,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
                                             "DummyReturnBlock", &F);
          Type *RetTy = F.getReturnType();
          Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
-
-        // For pixel shaders, the producer guarantees that an export is
-        // executed before each return instruction. However, if there is an
-        // infinite loop and we insert a return ourselves, we need to uphold
-        // that guarantee by inserting a null export. This can happen e.g. in
-        // an infinite loop with kill instructions, which is supposed to
-        // terminate. However, we don't need to do this if there is a non-void
-        // return value, since then there is an epilog afterwards which will
-        // still export.
-        //
-        // Note: In the case where only some threads enter the infinite loop,
-        // this can result in the null export happening redundantly after the
-        // original exports. However, The last "real" export happens after all
-        // the threads that didn't enter an infinite loop converged, which
-        // means that the only extra threads to execute the null export are
-        // threads that entered the infinite loop, and they only could've
-        // exited through being killed which sets their exec bit to 0.
-        // Therefore, unless there's an actual infinite loop, which can have
-        // invalid results, or there's a kill after the last export, which we
-        // assume the frontend won't do, this export will have the same exec
-        // mask as the last "real" export, and therefore the valid mask will be
-        // overwritten with the same value and will still be correct. Also,
-        // even though this forces an extra unnecessary export wait, we assume
-        // that this happens rare enough in practice to that we don't have to
-        // worry about performance.
-        if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
-            RetTy->isVoidTy()) {
-          InsertExport = true;
-        }
-
          ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
          ReturningBlocks.push_back(DummyReturnBB);
        }
@@ -382,20 +312,9 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
    if (ReturningBlocks.empty())
      return Changed; // No blocks return
  
-  if (ReturningBlocks.size() == 1 && !InsertExport)
+  if (ReturningBlocks.size() == 1)
      return Changed; // Already has a single return block
  
-  // Unify returning blocks. If we are going to insert the export it is also
-  // necessary to include blocks that are uniformly reached, because in addition
-  // to inserting the export the "done" bits on existing exports will be cleared
-  // and we do not want to end up with the normal export in a non-unified,
-  // uniformly reached block with the "done" bit cleared.
-  auto BlocksToUnify = std::move(ReturningBlocks);
-  if (InsertExport) {
-    llvm::append_range(BlocksToUnify, UniformlyReachedRetBlocks);
-  }
-
-  unifyReturnBlockSet(F, DTU, BlocksToUnify, InsertExport,
-                      "UnifiedReturnBlock");
+  unifyReturnBlockSet(F, DTU, ReturningBlocks, "UnifiedReturnBlock");
    return true;
  }
diff --git a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll

index 6252147..c09be2b 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll
@@ -34,11 +34,8 @@ define amdgpu_ps void @return_void(float %0) #0 {
  ; CHECK-NEXT:  ; %bb.4: ; %end
  ; CHECK-NEXT:    v_mov_b32_e32 v0, 1.0
  ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:    exp mrt0 v1, v1, v1, v0 vm
+; CHECK-NEXT:    exp mrt0 v1, v1, v1, v0 done vm
  ; CHECK-NEXT:  BB0_5: ; %UnifiedReturnBlock
-; CHECK-NEXT:    s_waitcnt expcnt(0)
-; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
-; CHECK-NEXT:    exp null off, off, off, off done vm
  ; CHECK-NEXT:    s_endpgm
  ; CHECK-NEXT:  BB0_6:
  ; CHECK-NEXT:    s_mov_b64 exec, 0
@@ -81,11 +78,8 @@ define amdgpu_ps void @return_void_compr(float %0) #0 {
  ; CHECK-NEXT:    s_cbranch_execz BB1_5
  ; CHECK-NEXT:  ; %bb.4: ; %end
  ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    exp mrt0 v0, off, v0, off compr vm
+; CHECK-NEXT:    exp mrt0 v0, off, v0, off done compr vm
  ; CHECK-NEXT:  BB1_5: ; %UnifiedReturnBlock
-; CHECK-NEXT:    s_waitcnt expcnt(0)
-; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
-; CHECK-NEXT:    exp null off, off, off, off done vm
  ; CHECK-NEXT:    s_endpgm
  ; CHECK-NEXT:  BB1_6:
  ; CHECK-NEXT:    s_mov_b64 exec, 0
@@ -112,16 +106,12 @@ define amdgpu_ps void @only_kill() #0 {
  ; CHECK-NEXT:  BB2_1: ; %loop
  ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
  ; CHECK-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
-; CHECK-NEXT:    s_cbranch_scc0 BB2_4
+; CHECK-NEXT:    s_cbranch_scc0 BB2_3
  ; CHECK-NEXT:  ; %bb.2: ; %loop
  ; CHECK-NEXT:    ; in Loop: Header=BB2_1 Depth=1
  ; CHECK-NEXT:    s_mov_b64 exec, 0
-; CHECK-NEXT:    s_mov_b64 vcc, exec
-; CHECK-NEXT:    s_cbranch_execnz BB2_1
-; CHECK-NEXT:  ; %bb.3: ; %UnifiedReturnBlock
-; CHECK-NEXT:    exp null off, off, off, off done vm
-; CHECK-NEXT:    s_endpgm
-; CHECK-NEXT:  BB2_4:
+; CHECK-NEXT:    s_branch BB2_1
+; CHECK-NEXT:  BB2_3:
  ; CHECK-NEXT:    s_mov_b64 exec, 0
  ; CHECK-NEXT:    exp null off, off, off, off done vm
  ; CHECK-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll

index 76526f9..12b9287 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -724,24 +724,16 @@ bb5:                                              ; preds = %bb3
  
  ; IR-LABEL: @uniformly_reached_export
  ; IR-NEXT: .entry:
-; IR: br i1 [[CND:%.*]], label %[[EXP:.*]], label %[[FLOW:.*]]
-
-; IR: [[FLOW]]:
-; IR-NEXT: phi
-; IR-NEXT: br i1 [[CND2:%.*]], label %[[LOOP:.*]], label %UnifiedReturnBlock
+; IR: br i1 [[CND:%.*]], label %[[LOOP:.*]], label %[[EXP:.*]]
  
  ; IR: [[LOOP]]:
-; IR-NEXT: br i1 false, label %[[FLOW1:.*]], label %[[LOOP]]
+; IR-NEXT: br i1 false, label %DummyReturnBlock, label %[[LOOP]]
  
  ; IR: [[EXP]]:
-; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg false, i1 immarg true)
-; IR-NEXT: br label %[[FLOW]]
-
-; IR: [[FLOW1]]:
-; IR-NEXT: br label %UnifiedReturnBlock
+; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true)
+; IR-NEXT: ret void
  
-; IR: UnifiedReturnBlock:
-; IR-NEXT: call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true)
+; IR: DummyReturnBlock:
  ; IR-NEXT: ret void
  
  define amdgpu_ps void @uniformly_reached_export(float inreg %tmp25) {
diff --git a/llvm/test/CodeGen/AMDGPU/update-phi.ll b/llvm/test/CodeGen/AMDGPU/update-phi.ll

index c13eb0a..50666be 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/update-phi.ll
+++ b/llvm/test/CodeGen/AMDGPU/update-phi.ll
@@ -14,11 +14,12 @@ define amdgpu_ps void @_amdgpu_ps_main() local_unnamed_addr #3 {
  ; IR-NEXT:    [[DOT01:%.*]] = phi float [ 0.000000e+00, [[DOTLOOPEXIT]] ], [ [[N29:%.*]], [[TRANSITIONBLOCK:%.*]] ]
  ; IR-NEXT:    [[N29]] = fadd float [[DOT01]], 1.000000e+00
  ; IR-NEXT:    [[N30:%.*]] = fcmp ogt float [[N29]], 4.000000e+00
-; IR-NEXT:    br i1 true, label [[TRANSITIONBLOCK]], label [[UNIFIEDRETURNBLOCK:%.*]]
+; IR-NEXT:    br i1 true, label [[TRANSITIONBLOCK]], label [[DUMMYRETURNBLOCK:%.*]]
  ; IR:       TransitionBlock:
  ; IR-NEXT:    br i1 [[N30]], label [[DOTLOOPEXIT]], label [[N28]]
-; IR:       UnifiedReturnBlock:
-; IR-NEXT:    call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true)
+; IR:       n31:
+; IR-NEXT:    ret void
+; IR:       DummyReturnBlock:
  ; IR-NEXT:    ret void
  ;
  .entry:
author	Ruiling Song <ruiling.song@amd.com>
	Thu, 8 Jul 2021 01:42:06 +0000 (09:42 +0800)
committer	Ruiling Song <ruiling.song@amd.com>
	Wed, 14 Jul 2021 06:54:37 +0000 (14:54 +0800)
llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/update-phi.ll		patch \| blob \| history