[amdgpu] Lower SGPR-to-VGPR copy in the final phase of ISel.

author Michael Liao <michael.hliao@gmail.com>

Wed, 9 Sep 2020 20:48:03 +0000 (16:48 -0400)

committer Michael Liao <michael.hliao@gmail.com>

Thu, 17 Sep 2020 15:04:17 +0000 (11:04 -0400)
author Michael Liao <michael.hliao@gmail.com>
Wed, 9 Sep 2020 20:48:03 +0000 (16:48 -0400)
committer Michael Liao <michael.hliao@gmail.com>
Thu, 17 Sep 2020 15:04:17 +0000 (11:04 -0400)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

index a24394cdf795f59c2ca274c7908190b25d8c65c1..4df7fd85a5dde074970ab1ed5bcd54a2c01fccee 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1244,6 +1244,11 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
          foldOperand(OpToFold, UseMI, OpNo, FoldList,
                      CopiesToReplace);
        } else {
+        // Skip updating literal use if it's used in the same REG_SEQUENCE, as
+        // if that literal could be inlined, it's just a single use.
+        if (NonInlineUse && NonInlineUse->getParent() == UseMI &&
+            UseMI->isRegSequence())
+          continue;
          if (++NumLiteralUses == 1) {
            NonInlineUse = &*Use;
            NonInlineUseOpNo = OpNo;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp

index ed0a3a17e71af2a161e9aa253f5b63e942bc3723..b446ac3af9b13e7ca448af0026b9505134131cbb 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -102,6 +102,10 @@ static cl::opt<bool> UseDivergentRegisterIndexing(
    cl::desc("Use indirect register addressing for divergent indexes"),
    cl::init(false));
  
+static cl::opt<bool> EnableLowerSGPRToVGPRCopy(
+    "lower-sgpr-to-vgpr-copy", cl::Hidden,
+    cl::desc("Enable lowering copy from SGPR to VGPR"), cl::init(true));
+
  static bool hasFP32Denormals(const MachineFunction &MF) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    return Info->getMode().allFP32Denormals();
@@ -11485,6 +11489,59 @@ bool SITargetLowering::checkAsmConstraintValA(SDValue Op,
    return false;
  }
  
+// Rewrite COPYs from SGPR to VGPR as real moves (V_MOV_B32_e32 or
+// V_MOV_B64_PSEUDO), as they are real transfers instead of plain COPYs.
+static void lowerSGPRToVGPRCopy(MachineFunction &MF, MachineRegisterInfo &MRI,
+                                const SIRegisterInfo &TRI,
+                                const SIInstrInfo &TII) {
+  for (MachineBasicBlock &MBB : MF) {
+    for (auto BI = MBB.begin(), BE = MBB.end(); BI != BE; /*EMPTY*/) {
+      MachineInstr &MI = *BI++;
+
+      auto IsSGPRToVGPRCopy = [&MRI, &TRI](const MachineInstr &MI) {
+        if (!MI.isCopy())
+          return false;
+
+        auto DstReg = MI.getOperand(0).getReg();
+        auto SrcReg = MI.getOperand(1).getReg();
+        const auto *DstRC = DstReg.isVirtual() ? MRI.getRegClass(DstReg)
+                                               : TRI.getPhysRegClass(DstReg);
+        const auto *SrcRC = SrcReg.isVirtual() ? MRI.getRegClass(SrcReg)
+                                               : TRI.getPhysRegClass(SrcReg);
+        return (DstRC == &AMDGPU::VGPR_32RegClass ||
+                DstRC == &AMDGPU::VReg_64RegClass) &&
+               (SrcRC == &AMDGPU::SGPR_32RegClass ||
+                SrcRC == &AMDGPU::SGPR_64RegClass);
+      };
+
+      // Leave anything that is not an SGPR-to-VGPR copy untouched.
+      if (!IsSGPRToVGPRCopy(MI))
+        continue;
+
+      const MachineOperand &Src = MI.getOperand(1);
+      // FIXME: Need subreg support; sources with a subregister are skipped.
+      if (Src.getSubReg() != AMDGPU::NoSubRegister)
+        continue;
+      // FIXME: Need undef support; undefined/IMPLICIT_DEF sources are skipped.
+      if (Src.getReg().isVirtual()) {
+        auto *DefMI = MRI.getVRegDef(Src.getReg());
+        if (!DefMI || DefMI->isImplicitDef())
+          continue;
+      }
+
+      LLVM_DEBUG(dbgs() << "Lower COPY: " << MI);
+      unsigned Opcode = (TRI.getRegSizeInBits(Src.getReg(), MRI) == 64)
+                            ? AMDGPU::V_MOV_B64_PSEUDO
+                            : AMDGPU::V_MOV_B32_e32;
+      auto DstReg = MI.getOperand(0).getReg();
+      auto MIB = BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(Opcode), DstReg)
+                     .add(MI.getOperand(1));
+      LLVM_DEBUG(dbgs() << "        to: " << *MIB.getInstr());
+      MI.eraseFromParent();
+    }
+  }
+}
+
  // Figure out which registers should be reserved for stack access. Only after
  // the function is legalized do we know all of the non-spill stack objects or if
  // calls are present.
@@ -11493,6 +11550,10 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
    SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+  const SIInstrInfo *TII = Subtarget->getInstrInfo();
+
+  if (EnableLowerSGPRToVGPRCopy)
+    lowerSGPRToVGPRCopy(MF, MRI, *TRI, *TII);
  
    if (Info->isEntryFunction()) {
      // Callable functions have fixed registers used for stack access.
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll

index badaa16bbfcc5d447f93dc1603f13feea187260c..05f0bafb47c74282135b38674d425987330a5d8d 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.ll
@@ -11,7 +11,7 @@
  ; R600-NOT: AND
  ; R600: |PV.{{[XYZW]}}|
  
-; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; SI: s_bitset0_b32 s{{[0-9]+}}, 31
  ; VI: s_bitset0_b32 s{{[0-9]+}}, 31
  define amdgpu_kernel void @s_fabs_fn_free(float addrspace(1)* %out, i32 %in) {
    %bc= bitcast i32 %in to float
@@ -24,7 +24,7 @@ define amdgpu_kernel void @s_fabs_fn_free(float addrspace(1)* %out, i32 %in) {
  ; R600-NOT: AND
  ; R600: |PV.{{[XYZW]}}|
  
-; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; SI: s_bitset0_b32 s{{[0-9]+}}, 31
  ; VI: s_bitset0_b32 s{{[0-9]+}}, 31
  define amdgpu_kernel void @s_fabs_free(float addrspace(1)* %out, i32 %in) {
    %bc= bitcast i32 %in to float
@@ -36,7 +36,7 @@ define amdgpu_kernel void @s_fabs_free(float addrspace(1)* %out, i32 %in) {
  ; FUNC-LABEL: {{^}}s_fabs_f32:
  ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
  
-; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; SI: s_bitset0_b32 s{{[0-9]+}}, 31
  ; VI: s_bitset0_b32 s{{[0-9]+}}, 31
  define amdgpu_kernel void @s_fabs_f32(float addrspace(1)* %out, float %in) {
    %fabs = call float @llvm.fabs.f32(float %in)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll

index a621b04a346c083785ede7cb18b7b7040cdb302f..afae6b43ee5878261ab115b5dffd9d1254a1c452 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -34,7 +34,7 @@ define amdgpu_kernel void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x
  ; R600: |PV.{{[XYZW]}}|
  ; R600: -PV
  
-; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
+; SI: s_bitset1_b32 s{{[0-9]+}}, 31
  ; VI: s_bitset1_b32 s{{[0-9]+}}, 31
  define amdgpu_kernel void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
    %bc = bitcast i32 %in to float
@@ -49,7 +49,7 @@ define amdgpu_kernel void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in)
  ; R600: |PV.{{[XYZW]}}|
  ; R600: -PV
  
-; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
+; SI: s_bitset1_b32 s{{[0-9]+}}, 31
  define amdgpu_kernel void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) {
    %bc = bitcast i32 %in to float
    %fabs = call float @fabs(float %bc)
@@ -59,7 +59,7 @@ define amdgpu_kernel void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %
  }
  
  ; FUNC-LABEL: {{^}}fneg_fabs_f32:
-; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
+; SI: s_bitset1_b32 s{{[0-9]+}}, 31
  define amdgpu_kernel void @fneg_fabs_f32(float addrspace(1)* %out, float %in) {
    %fabs = call float @llvm.fabs.f32(float %in)
    %fsub = fsub float -0.000000e+00, %fabs
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy-cse.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy-cse.ll

new file mode 100644 (file)

index 0000000..f032f17
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy-cse.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs -o - %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7"
+target triple = "amdgcn-amd-amdhsa"
+
+; CHECK-LABEL: {{^}}t0:
+; CHECK: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], s[4:5], 0x0
+; CHECK-COUNT-1: v_mov_b32_e32 v{{[0-9]+}}, s[[PTR_HI]]
+; CHECK-NOT: v_mov_b32_e32 v{{[0-9]+}}, s[[PTR_HI]]
+define protected amdgpu_kernel void @t0(float addrspace(1)* %p, i32 %i0, i32 %j0, i32 %k0) {
+entry:
+  %0 = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %i = add i32 %0, %i0
+  %j = add i32 %0, %j0
+  %k = add i32 %0, %k0
+  %pi = getelementptr float, float addrspace(1)* %p, i32 %i
+  %vi = load float, float addrspace(1)* %pi
+  %pj = getelementptr float, float addrspace(1)* %p, i32 %j
+  %vj = load float, float addrspace(1)* %pj
+  %sum = fadd float %vi, %vj
+  %pk = getelementptr float, float addrspace(1)* %p, i32 %k
+  store float %sum, float addrspace(1)* %pk
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll

index 4cbd89147722ba07a9ccb48a2c5e22cd214cb0bd..4d9c6a9a540fd5ce97d106e4faa59f220feef2d1 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
@@ -153,7 +153,9 @@ bb:
  
  ; GCN-LABEL: barrier_vmcnt_vscnt_flat_workgroup:
  ; GCN:        flat_load_dword
-; GCN:        s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
+; GFX8_9:     s_waitcnt lgkmcnt(0){{$}}
+; GFX8_9:     s_waitcnt vmcnt(0){{$}}
+; GFX10:      s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
  ; GFX10:      s_waitcnt_vscnt null, 0x0
  ; GCN-NEXT:   s_barrier
  define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(i32* %arg) {
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll

index 127d0bc0fc6869354257fa1b9b21204fe5330450..860e58d33abf4fead4ccb7c09fbe12744bc4596b 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -650,12 +650,12 @@ main_body:
  ; CHECK: image_store
  ; CHECK: s_wqm_b64 exec, exec
  ; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
-; CHECK-DAG: s_mov_b32 [[SEVEN:s[0-9]+]], 0x40e00000
+; CHECK-DAG: v_mov_b32_e32 [[SEVEN:v[0-9]+]], 0x40e00000
  
  ; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body
  ; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
  ; CHECK: [[LOOP:BB[0-9]+_[0-9]+]]: ; %loop
-; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]]
+; CHECK: v_cmp_gt_f32_e32 vcc, [[CTR]], [[SEVEN]]
  ; CHECK: s_cbranch_vccz [[LOOPHDR]]
  
  ; CHECK: ; %break
author	Michael Liao <michael.hliao@gmail.com>
	Wed, 9 Sep 2020 20:48:03 +0000 (16:48 -0400)
committer	Michael Liao <michael.hliao@gmail.com>
	Thu, 17 Sep 2020 15:04:17 +0000 (11:04 -0400)
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp		patch \| blob \| history
llvm/lib/Target/AMDGPU/SIISelLowering.cpp		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/fabs.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/fneg-fabs.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/sgpr-copy-cse.ll	[new file with mode: 0644]	patch \| blob
llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/wqm.ll		patch \| blob \| history