[AMDGPU] Add an llvm.amdgcn.wqm intrinsic for WQM

author Connor Abbott <cwabbott0@gmail.com>

Fri, 4 Aug 2017 18:36:49 +0000 (18:36 +0000)

committer Connor Abbott <cwabbott0@gmail.com>

Fri, 4 Aug 2017 18:36:49 +0000 (18:36 +0000)
author Connor Abbott <cwabbott0@gmail.com>
Fri, 4 Aug 2017 18:36:49 +0000 (18:36 +0000)
committer Connor Abbott <cwabbott0@gmail.com>
Fri, 4 Aug 2017 18:36:49 +0000 (18:36 +0000)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td

index 4e0529a..e5c57e0 100644 (file)
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -740,6 +740,13 @@ def int_amdgcn_alignbyte : Intrinsic<[llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable]
  >;
  
+
+// Copies the source value to the destination value, with the guarantee that
+// the source value is computed as if the entire program were executed in WQM.
+def int_amdgcn_wqm : Intrinsic<[llvm_any_ty],
+  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
+>;
+
  //===----------------------------------------------------------------------===//
  // CI+ Intrinsics
  //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

index 32e83cb..ee2b415 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -338,6 +338,9 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
                                      unsigned &SMovOp,
                                      int64_t &Imm) {
  
+  if (Copy->getOpcode() != AMDGPU::COPY)
+    return false;
+
    if (!MoveImm->isMoveImmediate())
      return false;
  
@@ -564,7 +567,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
        switch (MI.getOpcode()) {
        default:
          continue;
-      case AMDGPU::COPY: {
+      case AMDGPU::COPY:
+      case AMDGPU::WQM: {
          // If the destination register is a physical register there isn't really
          // much we can do to fix this.
          if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()))
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp

index 7abb463..fecad1e 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3942,6 +3942,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                 Op.getOperand(1), Op.getOperand(2));
      return DAG.getNode(ISD::BITCAST, DL, VT, Node);
    }
+  case Intrinsic::amdgcn_wqm: {
+    SDValue Src = Op.getOperand(1);
+    return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
+                   0);
+  }
    default:
      return Op;
    }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

index 1d88452..dc4b999 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2666,6 +2666,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
    case AMDGPU::COPY: return AMDGPU::COPY;
    case AMDGPU::PHI: return AMDGPU::PHI;
    case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
+  case AMDGPU::WQM: return AMDGPU::WQM;
    case AMDGPU::S_MOV_B32:
      return MI.getOperand(1).isReg() ?
             AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
@@ -3970,6 +3971,7 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
    case AMDGPU::PHI:
    case AMDGPU::REG_SEQUENCE:
    case AMDGPU::INSERT_SUBREG:
+  case AMDGPU::WQM:
      if (RI.hasVGPRs(NewDstRC))
        return nullptr;
  
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td

index 50e8061..f20ce4d 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -116,6 +116,11 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
  // pass to enable folding of inline immediates.
  def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
                                        (ins VSrc_b64:$src0)>;
+
+// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
+// WQM pass processes it.
+def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
+
  } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
  
  let usesCustomInserter = 1, SALU = 1 in {
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

index a613a22..62e1b7e 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -136,6 +136,7 @@ private:
    DenseMap<const MachineInstr *, InstrInfo> Instructions;
    DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
    SmallVector<MachineInstr *, 1> LiveMaskQueries;
+  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
  
    void printInfo();
  
@@ -162,6 +163,7 @@ private:
    void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
  
    void lowerLiveMaskQueries(unsigned LiveMaskReg);
+  void lowerCopyInstrs();
  
  public:
    static char ID;
@@ -294,6 +296,11 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
          markUsesWQM(MI, Worklist);
          GlobalFlags |= StateWQM;
          continue;
+      } else if (Opcode == AMDGPU::WQM) {
+        // The WQM intrinsic requires its output to have all the helper lanes
+        // correct, so we need it to be in WQM.
+        Flags = StateWQM;
+        LowerToCopyInstrs.push_back(&MI);
        } else if (TII->isDisableWQM(MI)) {
          Flags = StateExact;
        } else {
@@ -666,6 +673,11 @@ void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
    }
  }
  
+void SIWholeQuadMode::lowerCopyInstrs() {
+  for (MachineInstr *MI : LowerToCopyInstrs)
+    MI->setDesc(TII->get(AMDGPU::COPY));
+}
+
  bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
    if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
      return false;
@@ -673,6 +685,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
    Instructions.clear();
    Blocks.clear();
    LiveMaskQueries.clear();
+  LowerToCopyInstrs.clear();
  
    const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  
@@ -708,6 +721,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
            .addReg(AMDGPU::EXEC);
  
        lowerLiveMaskQueries(LiveMaskReg);
+      lowerCopyInstrs();
        // EntryMI may become invalid here
        return true;
      }
@@ -716,6 +730,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
    DEBUG(printInfo());
  
    lowerLiveMaskQueries(LiveMaskReg);
+  lowerCopyInstrs();
  
    // Handle the general case
    for (auto BII : Blocks)
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll

index 133aaa3..2f47a52 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -74,6 +74,40 @@ main_body:
    ret <4 x float> %dtex
  }
  
+; Check that WQM is triggered by the wqm intrinsic.
+;
+;CHECK-LABEL: {{^}}test5:
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+  %out = fadd float %src0, %src1
+  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
+  ret float %out.0
+}
+
+; Check that the wqm intrinsic works correctly for integers.
+;
+;CHECK-LABEL: {{^}}test6:
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+  %out = fadd float %src0, %src1
+  %out.0 = bitcast float %out to i32
+  %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
+  %out.2 = bitcast i32 %out.1 to float
+  ret float %out.2
+}
+
  ; Check a case of one branch of an if-else requiring WQM, the other requiring
  ; exact.
  ;
@@ -494,6 +528,8 @@ declare <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float, <8 x i32>,
  declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3
  declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3
  declare void @llvm.AMDGPU.kill(float) #1
+declare float @llvm.amdgcn.wqm.f32(float) #3
+declare i32 @llvm.amdgcn.wqm.i32(i32) #3
  
  attributes #1 = { nounwind }
  attributes #2 = { nounwind readonly }
author	Connor Abbott <cwabbott0@gmail.com>
	Fri, 4 Aug 2017 18:36:49 +0000 (18:36 +0000)
committer	Connor Abbott <cwabbott0@gmail.com>
	Fri, 4 Aug 2017 18:36:49 +0000 (18:36 +0000)
llvm/include/llvm/IR/IntrinsicsAMDGPU.td		patch \| blob \| history
llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp		patch \| blob \| history
llvm/lib/Target/AMDGPU/SIISelLowering.cpp		patch \| blob \| history
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp		patch \| blob \| history
llvm/lib/Target/AMDGPU/SIInstructions.td		patch \| blob \| history
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/wqm.ll		patch \| blob \| history