return findRegisterDefOperandIdx(Reg, true, false, TRI) != -1;
}
+ /// Returns true if the MachineInstr has an implicit-use operand of exactly
+ /// the given register (not considering sub/super-registers).
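+ /// For example, a query for a super-register such as AMDGPU's EXEC matches
+ /// only an implicit use of EXEC itself, never of EXEC_LO or EXEC_HI.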
+ bool hasRegisterImplicitUseOperand(unsigned Reg) const;
+
/// Returns the operand index that is a use of the specific register or -1
/// if it is not found. It further tightens the search criteria to a use
/// that kills the register if isKill is true.
[IntrNoMem]>; // See int_amdgcn_v_interp_p1 for why this is
// IntrNoMem.
+// Pixel shaders only: whether the current pixel is live (i.e. not a helper
+// invocation for derivative computation).
+def int_amdgcn_ps_live : Intrinsic <
+ [llvm_i1_ty],
+ [],
+ [IntrNoMem]>;
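+//
+// Illustrative IR usage (the same pattern the new test below exercises):
+//   %live = call i1 @llvm.amdgcn.ps.live()
+//   br i1 %live, label %live.path, label %helper.path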
+
def int_amdgcn_mbcnt_lo :
GCCBuiltin<"__builtin_amdgcn_mbcnt_lo">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
return Size;
}
+/// Returns true if the MachineInstr has an implicit-use operand of exactly
+/// the given register (not considering sub/super-registers).
+bool MachineInstr::hasRegisterImplicitUseOperand(unsigned Reg) const {
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = getOperand(i);
+ if (MO.isReg() && MO.isUse() && MO.isImplicit() && MO.getReg() == Reg)
+ return true;
+ }
+ return false;
+}
+
/// findRegisterUseOperandIdx() - Returns the operand index that is a use of
/// the specific register, or -1 if it is not found. It further tightens
/// the search criteria to a use that kills the register if isKill is true.
case Intrinsic::amdgcn_buffer_atomic_or:
case Intrinsic::amdgcn_buffer_atomic_xor:
case Intrinsic::amdgcn_buffer_atomic_cmpswap:
+ case Intrinsic::amdgcn_ps_live:
return true;
}
// Make sure we aren't losing exec uses in the td files. This mostly requires
// being careful when using let Uses to add other use registers, since that
// replaces the implicit uses rather than appending to them.
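// For example, a definition wrapped in "let Uses = [M0] in ..." silently
// drops the implicit EXEC use unless EXEC is listed as well, as in
// "let Uses = [EXEC, M0] in ...".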
if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) {
- const MachineOperand *Exec = MI->findRegisterUseOperand(AMDGPU::EXEC);
- if (!Exec || !Exec->isImplicit()) {
+ if (!MI->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
ErrInfo = "VALU instruction does not implicitly read exec mask";
return false;
}
} // End mayLoad = 1, mayStore = 1, hasSideEffects = 1
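+// Pseudo that reads whether the current pixel is truly live. The live mask is
+// a 64-bit lane mask held in an SGPR pair, hence SALU = 1 and the SReg_64
+// result; SIWholeQuadMode lowers it to a copy of the live mask register.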
+let SALU = 1 in
+def SI_PS_LIVE : InstSI <
+ (outs SReg_64:$dst),
+ (ins),
+ "si_ps_live $dst",
+ [(set i1:$dst, (int_amdgcn_ps_live))]
+>;
+
// Used as an isel pseudo to directly emit initialization with an
// s_mov_b32 rather than a copy of another initialized
// register. MachineCSE skips copies, and we don't want to have to
DenseMap<const MachineInstr *, InstrInfo> Instructions;
DenseMap<const MachineBasicBlock *, BlockInfo> Blocks;
SmallVector<const MachineInstr *, 2> ExecExports;
+ SmallVector<MachineInstr *, 1> LiveMaskQueries;
- char scanInstructions(const MachineFunction &MF, std::vector<WorkItem>& Worklist);
+ char scanInstructions(MachineFunction &MF, std::vector<WorkItem>& Worklist);
void propagateInstruction(const MachineInstr &MI, std::vector<WorkItem>& Worklist);
void propagateBlock(const MachineBasicBlock &MBB, std::vector<WorkItem>& Worklist);
- char analyzeFunction(const MachineFunction &MF);
+ char analyzeFunction(MachineFunction &MF);
void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SaveWQM, unsigned LiveMaskReg);
void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
           unsigned SavedWQM);
void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
+ void lowerLiveMaskQueries(unsigned LiveMaskReg);
+
public:
static char ID;
// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
-char SIWholeQuadMode::scanInstructions(const MachineFunction &MF,
+char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
std::vector<WorkItem> &Worklist) {
char GlobalFlags = 0;
for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {
- const MachineBasicBlock &MBB = *BI;
+ MachineBasicBlock &MBB = *BI;
for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
- const MachineInstr &MI = *II;
+ MachineInstr &MI = *II;
unsigned Opcode = MI.getOpcode();
char Flags;
Flags = StateExact;
} else {
// Handle export instructions with the exec mask valid flag set
- if (Opcode == AMDGPU::EXP && MI.getOperand(4).getImm() != 0)
- ExecExports.push_back(&MI);
+ if (Opcode == AMDGPU::EXP) {
+ if (MI.getOperand(4).getImm() != 0)
+ ExecExports.push_back(&MI);
+ } else if (Opcode == AMDGPU::SI_PS_LIVE) {
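+        // Record the query; it is lowered once the live mask register is
+        // known (see lowerLiveMaskQueries).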
+ LiveMaskQueries.push_back(&MI);
+ }
+
continue;
}
}
}
-char SIWholeQuadMode::analyzeFunction(const MachineFunction &MF) {
+char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
std::vector<WorkItem> Worklist;
char GlobalFlags = scanInstructions(MF, Worklist);
}
}
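+// Replace all live mask queries (SI_PS_LIVE) with copies of the register that
+// holds the live mask, and erase the query instructions.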
+void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
+ for (MachineInstr *MI : LiveMaskQueries) {
+ DebugLoc DL = MI->getDebugLoc();
+ unsigned Dest = MI->getOperand(0).getReg();
+ BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
+ .addReg(LiveMaskReg);
+ MI->eraseFromParent();
+ }
+}
+
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
return false;
Instructions.clear();
Blocks.clear();
ExecExports.clear();
+ LiveMaskQueries.clear();
TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
TRI = static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
MRI = &MF.getRegInfo();
char GlobalFlags = analyzeFunction(MF);
- if (!(GlobalFlags & StateWQM))
- return false;
+ if (!(GlobalFlags & StateWQM)) {
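+    // No WQM switching is needed anywhere, so EXEC itself is the live mask
+    // for the entire function and the queries can simply read it.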
+ lowerLiveMaskQueries(AMDGPU::EXEC);
+ return !LiveMaskQueries.empty();
+ }
+ // Store a copy of the original live mask when required
MachineBasicBlock &Entry = MF.front();
MachineInstr *EntryMI = Entry.getFirstNonPHI();
+ unsigned LiveMaskReg = 0;
+
+  if ((GlobalFlags & StateExact) || !LiveMaskQueries.empty()) {
+ LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
+ .addReg(AMDGPU::EXEC);
+ }
if (GlobalFlags == StateWQM) {
// For a shader that needs only WQM, we can just set it once.
BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
AMDGPU::EXEC).addReg(AMDGPU::EXEC);
+
+ lowerLiveMaskQueries(LiveMaskReg);
+    // EntryMI may have been erased by lowerLiveMaskQueries above; do not
+    // use it past this point.
return true;
}
- // Handle the general case
- unsigned LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
- BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
- .addReg(AMDGPU::EXEC);
+ lowerLiveMaskQueries(LiveMaskReg);
+ EntryMI = nullptr;
+ // Handle the general case
for (const auto &BII : Blocks)
processBlock(const_cast<MachineBasicBlock &>(*BII.first), LiveMaskReg,
BII.first == &*MF.begin());
--- /dev/null
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
+
+; CHECK-LABEL: {{^}}test1:
+; CHECK: v_cndmask_b32_e64 v0, 0, 1, exec
+;
+; Note: We could generate better code here if we recognized earlier that
+; there is no WQM use and therefore llvm.amdgcn.ps.live is constant. However,
+; the expectation is that the intrinsic will be used in non-trivial shaders,
+; so such an optimization doesn't seem worth the effort.
+define amdgpu_ps float @test1() {
+ %live = call i1 @llvm.amdgcn.ps.live()
+ %live.32 = zext i1 %live to i32
+ %r = bitcast i32 %live.32 to float
+ ret float %r
+}
+
+; CHECK-LABEL: {{^}}test2:
+; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK-DAG: s_wqm_b64 exec, exec
+; CHECK-DAG: v_cndmask_b32_e64 [[VAR:v[0-9]+]], 0, 1, [[LIVE]]
+; CHECK: image_sample v0, [[VAR]],
+define amdgpu_ps float @test2() {
+ %live = call i1 @llvm.amdgcn.ps.live()
+ %live.32 = zext i1 %live to i32
+
+ %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %live.32, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+
+ %r = extractelement <4 x float> %t, i32 0
+ ret float %r
+}
+
+; CHECK-LABEL: {{^}}test3:
+; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK-DAG: s_wqm_b64 exec, exec
+; CHECK-DAG: s_xor_b64 [[HELPER:s\[[0-9]+:[0-9]+\]]], [[LIVE]], -1
+; CHECK-DAG: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[HELPER]]
+; CHECK: ; %dead
+define amdgpu_ps float @test3(i32 %in) {
+entry:
+ %live = call i1 @llvm.amdgcn.ps.live()
+ br i1 %live, label %end, label %dead
+
+dead:
+ %tc.dead = mul i32 %in, 2
+ br label %end
+
+end:
+ %tc = phi i32 [ %in, %entry ], [ %tc.dead, %dead ]
+ %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %tc, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+
+ %r = extractelement <4 x float> %t, i32 0
+ ret float %r
+}
+
+declare i1 @llvm.amdgcn.ps.live() #0
+
+declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+
+attributes #0 = { nounwind readnone }