From bae3636f9692a56cbb2d75271766a4d7eca4e8c3 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 1 Jul 2019 18:50:50 +0000
Subject: [PATCH] AMDGPU/GlobalISel: Handle more input argument intrinsics

llvm-svn: 364836
---
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp      | 101 ++++++++++++---------
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp     |  12 +++
 .../AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll    |   2 +-
 .../AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll    |  18 ++--
 .../AMDGPU/GlobalISel/llvm.amdgcn.dispatch.id.ll   |  19 ++++
 .../AMDGPU/GlobalISel/llvm.amdgcn.dispatch.ptr.ll  |  18 ++++
 .../GlobalISel/llvm.amdgcn.implicit.buffer.ptr.ll  |  17 ++++
 .../AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll     |  18 ++++
 .../AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll      |   2 +-
 9 files changed, 155 insertions(+), 52 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.id.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.ptr.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.implicit.buffer.ptr.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 528a773..adcc904 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -198,6 +198,58 @@ static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
   }
 }
 
+// Allocate special inputs passed in user SGPRs.
+static void allocateHSAUserSGPRs(CCState &CCInfo,
+                                 MachineIRBuilder &MIRBuilder,
+                                 MachineFunction &MF,
+                                 const SIRegisterInfo &TRI,
+                                 SIMachineFunctionInfo &Info) {
+  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
+  if (Info.hasPrivateSegmentBuffer()) {
+    unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
+    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
+    CCInfo.AllocateReg(PrivateSegmentBufferReg);
+  }
+
+  if (Info.hasDispatchPtr()) {
+    unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
+    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
+    CCInfo.AllocateReg(DispatchPtrReg);
+  }
+
+  if (Info.hasQueuePtr()) {
+    unsigned QueuePtrReg = Info.addQueuePtr(TRI);
+    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
+    CCInfo.AllocateReg(QueuePtrReg);
+  }
+
+  if (Info.hasKernargSegmentPtr()) {
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
+    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
+    Register VReg = MRI.createGenericVirtualRegister(P4);
+    MRI.addLiveIn(InputPtrReg, VReg);
+    MIRBuilder.getMBB().addLiveIn(InputPtrReg);
+    MIRBuilder.buildCopy(VReg, InputPtrReg);
+    CCInfo.AllocateReg(InputPtrReg);
+  }
+
+  if (Info.hasDispatchID()) {
+    unsigned DispatchIDReg = Info.addDispatchID(TRI);
+    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
+    CCInfo.AllocateReg(DispatchIDReg);
+  }
+
+  if (Info.hasFlatScratchInit()) {
+    unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
+    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
+    CCInfo.AllocateReg(FlatScratchInitReg);
+  }
+
+  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
+  // these from the dispatch pointer.
+}
+
 static void allocateSystemSGPRs(CCState &CCInfo,
                                 MachineFunction &MF,
                                 SIMachineFunctionInfo &Info,
@@ -272,51 +324,12 @@ bool AMDGPUCallLowering::lowerFormalArguments(
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
 
-  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
-  if (Info->hasPrivateSegmentBuffer()) {
-    Register PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
-    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
-    CCInfo.AllocateReg(PrivateSegmentBufferReg);
-  }
-
-  if (Info->hasDispatchPtr()) {
-    Register DispatchPtrReg = Info->addDispatchPtr(*TRI);
-    // FIXME: Need to add reg as live-in
-    CCInfo.AllocateReg(DispatchPtrReg);
-  }
-
-  if (Info->hasQueuePtr()) {
-    Register QueuePtrReg = Info->addQueuePtr(*TRI);
-    // FIXME: Need to add reg as live-in
-    CCInfo.AllocateReg(QueuePtrReg);
-  }
-
-  if (Info->hasKernargSegmentPtr()) {
-    Register InputPtrReg = Info->addKernargSegmentPtr(*TRI);
-    const LLT P2 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
-    Register VReg = MRI.createGenericVirtualRegister(P2);
-    MRI.addLiveIn(InputPtrReg, VReg);
-    MIRBuilder.getMBB().addLiveIn(InputPtrReg);
-    MIRBuilder.buildCopy(VReg, InputPtrReg);
-    CCInfo.AllocateReg(InputPtrReg);
-  }
-
-  if (Info->hasDispatchID()) {
-    unsigned DispatchIDReg = Info->addDispatchID(*TRI);
-    // FIXME: Need to add reg as live-in
-    CCInfo.AllocateReg(DispatchIDReg);
-  }
-
-  if (Info->hasFlatScratchInit()) {
-    unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
-    // FIXME: Need to add reg as live-in
-    CCInfo.AllocateReg(FlatScratchInitReg);
-  }
-
   // The infrastructure for normal calling convention lowering is essentially
   // useless for kernels. We want to avoid any kind of legalization or argument
   // splitting.
   if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) {
+    allocateHSAUserSGPRs(CCInfo, MIRBuilder, MF, *TRI, *Info);
+
     unsigned i = 0;
     const unsigned KernArgBaseAlign = 16;
     const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
@@ -352,6 +365,12 @@ bool AMDGPUCallLowering::lowerFormalArguments(
     return true;
   }
 
+  if (Info->hasImplicitBufferPtr()) {
+    unsigned ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
+    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
+    CCInfo.AllocateReg(ImplicitBufferPtrReg);
+  }
+
   unsigned NumArgs = F.arg_size();
   Function::const_arg_iterator CurOrigArg = F.arg_begin();
   const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 653be65..cada5e5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1237,6 +1237,18 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
   case Intrinsic::amdgcn_workgroup_id_z:
     return legalizePreloadedArgIntrin(MI, MRI, B,
                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
+  case Intrinsic::amdgcn_dispatch_ptr:
+    return legalizePreloadedArgIntrin(MI, MRI, B,
+                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
+  case Intrinsic::amdgcn_queue_ptr:
+    return legalizePreloadedArgIntrin(MI, MRI, B,
+                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
+  case Intrinsic::amdgcn_implicit_buffer_ptr:
+    return legalizePreloadedArgIntrin(
+      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
+  case Intrinsic::amdgcn_dispatch_id:
+    return legalizePreloadedArgIntrin(MI, MRI, B,
+                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
   default:
     return true;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll
index 2ead7bd..0d11aa9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -stop-after=irtranslator -global-isel %s -o - | FileCheck %s
 
 ; Check that we correctly skip over disabled inputs
-; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr0
+; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2
 ; CHECK: [[V0:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), %{{[0-9]+}}(s32), %{{[0-9]+}}(s32), [[S0]](s32), [[S0]](s32), [[S0]](s32), [[V0]](s32)
 define amdgpu_ps void @ps0(float inreg %arg0, float %psinput0, float %psinput1) #1 {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll
index 8d87ce1..557ce16 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll
@@ -2,7 +2,7 @@
 
 
 ; CHECK-LABEL: name: test_f32_inreg
-; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr0
+; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2
 ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), %{{[0-9]+}}(s32), %{{[0-9]+}}(s32), [[S0]]
 define amdgpu_vs void @test_f32_inreg(float inreg %arg0) {
   call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %arg0, float undef, float undef, float undef, i1 false, i1 false) #0
@@ -18,7 +18,7 @@ define amdgpu_vs void @test_f32(float %arg0) {
 }
 
 ; CHECK-LABEL: name: test_ptr2_byval
-; CHECK: [[S01:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1
+; CHECK: [[S01:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3
 ; CHECK: G_LOAD [[S01]]
 define amdgpu_vs void @test_ptr2_byval(i32 addrspace(4)* byval %arg0) {
    %tmp0 = load volatile i32, i32 addrspace(4)* %arg0
@@ -26,7 +26,7 @@ define amdgpu_vs void @test_ptr2_byval(i32 addrspace(4)* byval %arg0) {
 }
 
 ; CHECK-LABEL: name: test_ptr2_inreg
-; CHECK: [[S01:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1
+; CHECK: [[S01:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3
 ; CHECK: G_LOAD [[S01]]
 define amdgpu_vs void @test_ptr2_inreg(i32 addrspace(4)* inreg %arg0) {
   %tmp0 = load volatile i32, i32 addrspace(4)* %arg0
@@ -34,8 +34,8 @@ define amdgpu_vs void @test_ptr2_inreg(i32 addrspace(4)* inreg %arg0) {
 }
 
 ; CHECK-LABEL: name: test_sgpr_alignment0
-; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr0
-; CHECK: [[S23:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3
+; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2
+; CHECK: [[S23:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
 ; CHECK: G_LOAD [[S23]]
 ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), %{{[0-9]+}}(s32), %{{[0-9]+}}(s32), [[S0]]
 define amdgpu_vs void @test_sgpr_alignment0(float inreg %arg0, i32 addrspace(4)* inreg %arg1) {
@@ -45,8 +45,8 @@ define amdgpu_vs void @test_sgpr_alignment0(float inreg %arg0, i32 addrspace(4)*
 }
 
 ; CHECK-LABEL: name: test_order
-; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr0
-; CHECK: [[S1:%[0-9]+]]:_(s32) = COPY $sgpr1
+; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2
+; CHECK: [[S1:%[0-9]+]]:_(s32) = COPY $sgpr3
 ; CHECK: [[V0:%[0-9]+]]:_(s32) = COPY $vgpr0
 ; CHECK: [[V1:%[0-9]+]]:_(s32) = COPY $vgpr1
 ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), %{{[0-9]+}}(s32), %{{[0-9]+}}(s32), [[V0]](s32), [[S0]](s32), [[V1]](s32), [[S1]](s32)
@@ -56,8 +56,8 @@ define amdgpu_vs void @test_order(float inreg %arg0, float inreg %arg1, float %a
 }
 
 ; CHECK-LABEL: name: ret_struct
-; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr0
-; CHECK: [[S1:%[0-9]+]]:_(s32) = COPY $sgpr1
+; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2
+; CHECK: [[S1:%[0-9]+]]:_(s32) = COPY $sgpr3
 ; CHECK: $sgpr0 = COPY [[S0]]
 ; CHECK: $sgpr1 = COPY [[S1]]
 ; CHECK: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.id.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.id.ll
new file mode 100644
index 0000000..f0c2067
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.id.ll
@@ -0,0 +1,19 @@
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare i64 @llvm.amdgcn.dispatch.id() #1
+
+; GCN-LABEL: {{^}}dispatch_id:
+; GCN: .amd_kernel_code_t
+; GCN: enable_sgpr_dispatch_id = 1
+
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s6
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s7
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define amdgpu_kernel void @dispatch_id(i64 addrspace(1)* %out) #0 {
+  %tmp0 = call i64 @llvm.amdgcn.dispatch.id()
+  store i64 %tmp0, i64 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.ptr.ll
new file mode 100644
index 0000000..e2e0dcd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.dispatch.ptr.ll
@@ -0,0 +1,18 @@
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; FIXME: Error on non-HSA target
+
+; GCN-LABEL: {{^}}test:
+; GCN: enable_sgpr_dispatch_ptr = 1
+; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
+  %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+  %header_ptr = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)*
+  %value = load i32, i32 addrspace(4)* %header_ptr
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+
+attributes #0 = { readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.implicit.buffer.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.implicit.buffer.ptr.ll
new file mode 100644
index 0000000..2028b8a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.implicit.buffer.ptr.ll
@@ -0,0 +1,17 @@
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; FIXME: Dropped parts from original test
+
+; GCN-LABEL: {{^}}test_ps:
+; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0
+define amdgpu_ps i32 @test_ps() #1 {
+  %implicit_buffer_ptr = call i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr()
+  %buffer_ptr = bitcast i8 addrspace(4)* %implicit_buffer_ptr to i32 addrspace(4)*
+  %value = load volatile i32, i32 addrspace(4)* %buffer_ptr
+  ret i32 %value
+}
+
+declare i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr() #0
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll
new file mode 100644
index 0000000..155a060
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.queue.ptr.ll
@@ -0,0 +1,18 @@
+; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-code-object-v3 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; FIXME: Error on non-hsa target
+
+; GCN-LABEL: {{^}}test:
+; GCN: enable_sgpr_queue_ptr = 1
+; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
+  %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
+  %header_ptr = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
+  %value = load i32, i32 addrspace(4)* %header_ptr
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+declare noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
+
+attributes #0 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll
index 3a69ef6..e5e1a44 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; FIXME: Requires stack object to not assert
 ; GCN-LABEL: {{^}}test_ps:
-- 
2.7.4