[AMDGPU] Revert b0efc4fd6 (https://reviews.llvm.org/D40556)

author Alexander Timofeev <Alexander.Timofeev@amd.com>
Wed, 25 Apr 2018 12:32:46 +0000 (12:32 +0000)
committer Alexander Timofeev <Alexander.Timofeev@amd.com>
Wed, 25 Apr 2018 12:32:46 +0000 (12:32 +0000)
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

index 8b155c2..e26bc99 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -81,7 +81,6 @@
  #include "llvm/CodeGen/MachineInstrBuilder.h"
  #include "llvm/CodeGen/MachineOperand.h"
  #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachinePostDominators.h"
  #include "llvm/CodeGen/TargetRegisterInfo.h"
  #include "llvm/Pass.h"
  #include "llvm/Support/CodeGen.h"
@@ -110,12 +109,7 @@ namespace {
  
  class SIFixSGPRCopies : public MachineFunctionPass {
    MachineDominatorTree *MDT;
-  MachinePostDominatorTree *MPDT;
-  DenseMap<MachineBasicBlock *, SetVector<MachineBasicBlock*>> PDF;
-  void computePDF(MachineFunction * MF);
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  void printPDF();
-#endif
+
  public:
    static char ID;
  
@@ -128,8 +122,6 @@ public:
    void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.addRequired<MachineDominatorTree>();
      AU.addPreserved<MachineDominatorTree>();
-    AU.addRequired<MachinePostDominatorTree>();
-    AU.addPreserved<MachinePostDominatorTree>();
      AU.setPreservesCFG();
      MachineFunctionPass::getAnalysisUsage(AU);
    }
@@ -417,6 +409,12 @@ bool searchPredecessors(const MachineBasicBlock *MBB,
    return false;
  }
  
+static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
+                                        const TargetRegisterInfo *TRI) {
+  return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) {
+           return hasTerminatorThatModifiesExec(*MBB, *TRI); });
+}
+
  // Checks if there is potential path From instruction To instruction.
  // If CutOff is specified and it sits in between of that path we ignore
  // a higher portion of the path and report it is not reachable.
@@ -567,47 +565,12 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
    return Changed;
  }
  
-void SIFixSGPRCopies::computePDF(MachineFunction *MF) {
-  MachineFunction::iterator B = MF->begin();
-  MachineFunction::iterator E = MF->end();
-  for (; B != E; ++B) {
-    if (B->succ_size() > 1) {
-      for (auto S : B->successors()) {
-        MachineDomTreeNode *runner = MPDT->getNode(&*S);
-        MachineDomTreeNode *sentinel = MPDT->getNode(&*B)->getIDom();
-        while (runner && runner != sentinel) {
-          PDF[runner->getBlock()].insert(&*B);
-          runner = runner->getIDom();
-        }
-      }
-    }
-  }
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void SIFixSGPRCopies::printPDF() {
-  dbgs() << "\n######## PostDominanceFrontiers set #########\n";
-  for (auto &I : PDF) {
-    dbgs() << "PDF[ " << I.first->getNumber() << "] : ";
-    for (auto &J : I.second) {
-      dbgs() << J->getNumber() << ' ';
-    }
-    dbgs() << '\n';
-  }
-  dbgs() << "\n##############################################\n";
-}
-#endif
-
  bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
    const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
    MachineRegisterInfo &MRI = MF.getRegInfo();
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const SIInstrInfo *TII = ST.getInstrInfo();
    MDT = &getAnalysis<MachineDominatorTree>();
-  MPDT = &getAnalysis<MachinePostDominatorTree>();
-  PDF.clear();
-  computePDF(&MF);
-  DEBUG(printPDF());
  
    SmallVector<MachineInstr *, 16> Worklist;
  
@@ -661,27 +624,15 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
          if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
            break;
  
-        // We don't need to fix the PHI if all the source blocks
-        // have no divergent control dependecies
+        // We don't need to fix the PHI if the common dominator of the
+        // two incoming blocks terminates with a uniform branch.
          bool HasVGPROperand = phiHasVGPROperands(MI, MRI, TRI, TII);
-        if (!HasVGPROperand) {
-          bool Uniform = true;
-          MachineBasicBlock * Join = MI.getParent();
-          for (auto &O : MI.explicit_operands()) {
-            if (O.isMBB()) {
-              MachineBasicBlock * Source = O.getMBB();
-              SetVector<MachineBasicBlock*> &SourcePDF = PDF[Source];
-              SetVector<MachineBasicBlock*> &JoinPDF   = PDF[Join];
-              SetVector<MachineBasicBlock*> CDList;
-              for (auto &I : SourcePDF) {
-                if (!JoinPDF.count(I) || /* back edge */MDT->dominates(Join, I)) {
-                  if (hasTerminatorThatModifiesExec(*I, *TRI))
-                    Uniform = false;
-                }
-              }
-            }
-          }
-          if (Uniform) {
+        if (MI.getNumExplicitOperands() == 5 && !HasVGPROperand) {
+          MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
+          MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();
+
+          if (!predsHasDivergentTerminator(MBB0, TRI) &&
+              !predsHasDivergentTerminator(MBB1, TRI)) {
              DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n');
              break;
            }
diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll

index c51b8e0..f8a0de6 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -90,7 +90,7 @@ endif:
  }
  
  ; GCN-LABEL: {{^}}divergent_loop:
-; VGPR: workitem_private_segment_byte_size = 12{{$}}
+; VGPR: workitem_private_segment_byte_size = 16{{$}}
  
  ; GCN: {{^}}; %bb.0:
  
@@ -124,9 +124,10 @@ endif:
  ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]:
  ; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
  ; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]]
-; GCN: s_cmp_lg_u32 s{{[0-9]+}}, s{{[0-9]+}}
+; GCN: v_cmp_ne_u32_e32 vcc,
+; GCN: s_and_b64 vcc, exec, vcc
  ; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill
-; GCN-NEXT: s_cbranch_scc1 [[LOOP]]
+; GCN-NEXT: s_cbranch_vccnz [[LOOP]]
  
  
  ; GCN: [[END]]:
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-PHI.ll b/llvm/test/CodeGen/AMDGPU/uniform-PHI.ll

deleted file mode 100644 (file)

index 3cb86b3..0000000
--- a/llvm/test/CodeGen/AMDGPU/uniform-PHI.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-
-; GCN-LABEL: BB0_2
-; GCN-NOT: v_readfirstlane
-
-
-target triple = "amdgcn--amdhsa"
-define amdgpu_kernel void @uniform-PHI(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
-bb:
-  %tmp = sext i32 %arg2 to i64
-  %tmp3 = tail call i64 @_Z13get_global_idj(i32 0) #2
-  %tmp4 = icmp ugt i64 %tmp3, %tmp
-  %tmp5 = icmp sgt i32 %arg2, 0
-  %tmp6 = and i1 %tmp4, %tmp5
-  br i1 %tmp6, label %bb7, label %bb17
-
-bb7:                                              ; preds = %bb
-  br label %bb8
-
-bb8:                                              ; preds = %bb8, %bb7
-  %tmp9 = phi i32 [ %tmp15, %bb8 ], [ 0, %bb7 ]
-  %tmp10 = phi i32 [ %tmp14, %bb8 ], [ 0, %bb7 ]
-  %tmp11 = zext i32 %tmp9 to i64
-  %tmp12 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp11
-  %tmp13 = load i32, i32 addrspace(1)* %tmp12, align 4
-  %tmp14 = add nsw i32 %tmp13, %tmp10
-  %tmp15 = add nuw nsw i32 %tmp9, 1
-  %tmp16 = icmp eq i32 %tmp15, %arg2
-  br i1 %tmp16, label %bb17, label %bb8
-
-bb17:                                             ; preds = %bb8, %bb
-  %tmp18 = phi i32 [ 0, %bb ], [ %tmp14, %bb8 ]
-  store i32 %tmp18, i32 addrspace(1)* %arg1, align 4
-  ret void
-}
-
-declare i64 @_Z13get_global_idj(i32) local_unnamed_addr #1
-attributes #1 = { convergent nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="fiji" "target-features"="+16-bit-insts,+dpp,+fp64-fp16-denormals,+s-memrealtime,-fp32-denormals" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { convergent nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll b/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll

index 1bbda66..82283f3 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
@@ -6,10 +6,11 @@
  ; CHECK: v_cmp_ne_u32_e32 vcc, 0
  ; CHECK: s_and_saveexec_b64
  ; CHECK-NEXT: ; mask branch
+; CHECK-NEXT: s_cbranch_execz BB{{[0-9]+_[0-9]+}}
  ; CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %loop_body.preheader
  
  ; CHECK: [[LOOP_BODY_LABEL:BB[0-9]+_[0-9]+]]:
-; CHECK: s_cbranch_scc0 [[LOOP_BODY_LABEL]]
+; CHECK: s_cbranch_vccz [[LOOP_BODY_LABEL]]
  
  ; CHECK: s_endpgm
  define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <2 x i32> %addr.base, i32 %y, i32 %p) {
@@ -34,6 +35,7 @@ out:
  ; CHECK-LABEL: {{^}}test2:
  ; CHECK: s_and_saveexec_b64
  ; CHECK-NEXT: ; mask branch
+; CHECK-NEXT: s_cbranch_execz
  define amdgpu_kernel void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) {
  main_body:
    %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll

index 4a3937e..1d0856b 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll
@@ -162,8 +162,8 @@ exit:
  ; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
  ; SI: buffer_load_dword
  ; SI-DAG: buffer_store_dword
-; SI-DAG: s_cmpk_eq_i32 s{{[0-9]+}}, 0x100
-; SI: s_cbranch_scc0 [[LABEL_LOOP]]
+; SI-DAG: v_cmp_eq_u32_e32 vcc, 0x100
+; SI: s_cbranch_vccz [[LABEL_LOOP]]
  ; SI: [[LABEL_EXIT]]:
  ; SI: s_endpgm
author	Alexander Timofeev <Alexander.Timofeev@amd.com>
	Wed, 25 Apr 2018 12:32:46 +0000 (12:32 +0000)
committer	Alexander Timofeev <Alexander.Timofeev@amd.com>
	Wed, 25 Apr 2018 12:32:46 +0000 (12:32 +0000)
llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/uniform-PHI.ll	[deleted file]	patch \| blob \| history
llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/valu-i1.ll		patch \| blob \| history