AMDGPU: Track physical registers in SIWholeQuadMode

author Nicolai Haehnle <nhaehnle@gmail.com>

Tue, 2 Aug 2016 19:17:37 +0000 (19:17 +0000)

committer Nicolai Haehnle <nhaehnle@gmail.com>

Tue, 2 Aug 2016 19:17:37 +0000 (19:17 +0000)
author Nicolai Haehnle <nhaehnle@gmail.com>
Tue, 2 Aug 2016 19:17:37 +0000 (19:17 +0000)
committer Nicolai Haehnle <nhaehnle@gmail.com>
Tue, 2 Aug 2016 19:17:37 +0000 (19:17 +0000)
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

index f79fbfe..cb35a05 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -94,12 +94,15 @@ private:
    const SIInstrInfo *TII;
    const SIRegisterInfo *TRI;
    MachineRegisterInfo *MRI;
+  LiveIntervals *LIS;
  
    DenseMap<const MachineInstr *, InstrInfo> Instructions;
    DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
    SmallVector<const MachineInstr *, 2> ExecExports;
    SmallVector<MachineInstr *, 1> LiveMaskQueries;
  
+  void markInstruction(MachineInstr &MI, char Flag,
+                       std::vector<WorkItem> &Worklist);
    char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
    void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
    void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
@@ -126,6 +129,7 @@ public:
    }
  
    void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LiveIntervals>();
      AU.setPreservesCFG();
      MachineFunctionPass::getAnalysisUsage(AU);
    }
@@ -135,8 +139,11 @@ public:
  
  char SIWholeQuadMode::ID = 0;
  
-INITIALIZE_PASS(SIWholeQuadMode, DEBUG_TYPE,
-                "SI Whole Quad Mode", false, false)
+INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
+                    false)
  
  char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
  
@@ -144,6 +151,23 @@ FunctionPass *llvm::createSIWholeQuadModePass() {
    return new SIWholeQuadMode;
  }
  
+void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
+                                      std::vector<WorkItem> &Worklist) {
+  InstrInfo &II = Instructions[&MI];
+
+  assert(Flag == StateWQM || Flag == StateExact);
+
+  // Ignore if the instruction is already marked. The typical case is that we
+  // mark an instruction WQM multiple times, but for atomics it can happen that
+  // Flag is StateWQM, but Needs is already set to StateExact. In this case,
+  // letting the atomic run in StateExact is correct as per the relevant specs.
+  if (II.Needs)
+    return;
+
+  II.Needs = Flag;
+  Worklist.push_back(&MI);
+}
+
  // Scan instructions to determine which ones require an Exact execmask and
  // which ones seed WQM requirements.
  char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
@@ -192,8 +216,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
            continue;
        }
  
-      Instructions[&MI].Needs = Flags;
-      Worklist.push_back(&MI);
+      markInstruction(MI, Flags, Worklist);
        GlobalFlags |= Flags;
      }
  
@@ -249,32 +272,35 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
      if (!Use.isReg() || !Use.isUse())
        continue;
  
-    // At this point, physical registers appear as inputs or outputs
-    // and following them makes no sense (and would in fact be incorrect
-    // when the same VGPR is used as both an output and an input that leads
-    // to a NeedsWQM instruction).
-    //
-    // Note: VCC appears e.g. in 64-bit addition with carry - theoretically we
-    // have to trace this, in practice it happens for 64-bit computations like
-    // pointers where both dwords are followed already anyway.
-    if (!TargetRegisterInfo::isVirtualRegister(Use.getReg()))
-      continue;
-
-    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) {
-      InstrInfo &DefII = Instructions[&DefMI];
+    unsigned Reg = Use.getReg();
  
-      // Obviously skip if DefMI is already flagged as NeedWQM.
-      //
-      // The instruction might also be flagged as NeedExact. This happens when
-      // the result of an atomic is used in a WQM computation. In this case,
-      // the atomic must not run for helper pixels and the WQM result is
-      // undefined.
-      if (DefII.Needs != 0)
+    // Handle physical registers that we need to track; this is mostly relevant
+    // for VCC, which can appear as the (implicit) input of a uniform branch,
+    // e.g. when a loop counter is stored in a VGPR.
+    if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+      if (Reg == AMDGPU::EXEC)
          continue;
  
-      DefII.Needs = StateWQM;
-      Worklist.push_back(&DefMI);
+      for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
+        LiveRange &LR = LIS->getRegUnit(*RegUnit);
+        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
+        if (!Value)
+          continue;
+
+        // Since we're in machine SSA, we do not need to track physical
+        // registers across basic blocks.
+        if (Value->isPHIDef())
+          continue;
+
+        markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
+                        Worklist);
+      }
+
+      continue;
      }
+
+    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
+      markInstruction(DefMI, StateWQM, Worklist);
    }
  }
  
@@ -471,6 +497,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
    TII = ST.getInstrInfo();
    TRI = &TII->getRegisterInfo();
    MRI = &MF.getRegInfo();
+  LIS = &getAnalysis<LiveIntervals>();
  
    char GlobalFlags = analyzeFunction(MF);
    if (!(GlobalFlags & StateWQM)) {
diff --git a/llvm/test/CodeGen/AMDGPU/basic-branch.ll b/llvm/test/CodeGen/AMDGPU/basic-branch.ll

index ff730a0..0063624 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/basic-branch.ll
+++ b/llvm/test/CodeGen/AMDGPU/basic-branch.ll
@@ -6,7 +6,6 @@
  ; GCN-LABEL: {{^}}test_branch:
  ; GCNNOOPT: v_writelane_b32
  ; GCNNOOPT: v_writelane_b32
-; GCNNOOPT: v_writelane_b32
  ; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]
  
  ; GCN: ; BB#1
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll

index 23b0ffd..bddcc07 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -350,6 +350,44 @@ main_body:
    ret float %s
  }
  
+; CHECK-LABEL: {{^}}test_loop_vcc:
+; CHECK-NEXT: ; %entry
+; CHECK-NEXT: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: s_and_b64 exec, exec, [[LIVE]]
+; CHECK: image_store
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: v_mov_b32_e32 [[CTR:v[0-9]+]], -2
+; CHECK: s_branch [[LOOPHDR:BB[0-9]+_[0-9]+]]
+
+; CHECK: [[LOOPHDR]]: ; %loop
+; CHECK: v_add_i32_e32 [[CTR]], vcc, 2, [[CTR]]
+; CHECK: v_cmp_lt_i32_e32 vcc, 7, [[CTR]]
+; CHECK: s_cbranch_vccz
+; CHECK: ; %break
+
+; CHECK: ; return
+define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
+entry:
+  call void @llvm.amdgcn.image.store.v4i32(<4 x float> %in, <4 x i32> undef, <8 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0)
+  br label %loop
+
+loop:
+  %ctr.iv = phi i32 [ 0, %entry ], [ %ctr.next, %body ]
+  %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
+  %cc = icmp sgt i32 %ctr.iv, 7
+  br i1 %cc, label %break, label %body
+
+body:
+  %c.i = bitcast <4 x float> %c.iv to <4 x i32>
+  %c.next = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %ctr.next = add i32 %ctr.iv, 2
+  br label %loop
+
+break:
+  ret <4 x float> %c.iv
+}
+
  declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
  
  declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
author	Nicolai Haehnle <nhaehnle@gmail.com>
	Tue, 2 Aug 2016 19:17:37 +0000 (19:17 +0000)
committer	Nicolai Haehnle <nhaehnle@gmail.com>
	Tue, 2 Aug 2016 19:17:37 +0000 (19:17 +0000)
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/basic-branch.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/wqm.ll		patch \| blob \| history