From 9392b40d4b1999c9b33490829552d928a2fa9bab Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 12 Jan 2022 16:28:49 -0500
Subject: [PATCH] AMDGPU/GlobalISel: Fix selection of constant 32-bit addrspace
 loads

Unfortunately the selection patterns still rely on the address space
from the memory operand instead of using the pointer type. Add the
constant 32-bit address space to the address space lists accepted by
the constant, global and flat load patterns.

Alternatively we would have to adjust the address space of the memory
operand to deviate from the underlying IR value, which looks ugly and
is more work in the legalizer.

This doesn't come up in the DAG path because it uses a different
selection strategy where the cast is inserted during the addressing
mode matching.
---
 llvm/lib/Target/AMDGPU/AMDGPUInstructions.td       | 15 ++--
 .../AMDGPU/GlobalISel/load-constant32bit.ll        | 93 ++++++++++++++++++++++
 2 files changed, 103 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant32bit.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 0528b55..4afd940 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -18,6 +18,7 @@ class AddressSpacesImpl {
   int Local = 3;
   int Constant = 4;
   int Private = 5;
+  int Constant32Bit = 6;
 }
 
 def AddrSpaces : AddressSpacesImpl;
@@ -410,13 +411,17 @@ class StoreHi16<SDPatternOperator op> : PatFrag <
   let IsStore = 1;
 }
 
-def LoadAddress_constant : AddressSpaceList<[  AddrSpaces.Constant ]>;
-def LoadAddress_global : AddressSpaceList<[  AddrSpaces.Global, AddrSpaces.Constant ]>;
+def LoadAddress_constant : AddressSpaceList<[ AddrSpaces.Constant,
+                                              AddrSpaces.Constant32Bit ]>;
+def LoadAddress_global : AddressSpaceList<[ AddrSpaces.Global,
+                                            AddrSpaces.Constant,
+                                            AddrSpaces.Constant32Bit ]>;
 def StoreAddress_global : AddressSpaceList<[ AddrSpaces.Global ]>;
 
-def LoadAddress_flat : AddressSpaceList<[  AddrSpaces.Flat,
-                                           AddrSpaces.Global,
-                                           AddrSpaces.Constant ]>;
+def LoadAddress_flat : AddressSpaceList<[ AddrSpaces.Flat,
+                                          AddrSpaces.Global,
+                                          AddrSpaces.Constant,
+                                          AddrSpaces.Constant32Bit ]>;
 def StoreAddress_flat : AddressSpaceList<[ AddrSpaces.Flat, AddrSpaces.Global ]>;
 
 def LoadAddress_private : AddressSpaceList<[ AddrSpaces.Private ]>;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant32bit.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant32bit.ll
new file mode 100644
index 0000000..f0ed8f0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant32bit.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; FIXME: Test should be redundant with constant-address-space-32bit.ll
+
+; It's important to check with gfx8 and gfx9 to check access through flat and global, respectively.
+
+; Custom lowering needs to swap out the MMO address space
+define amdgpu_ps float @load_constant32bit_vgpr_offset(i32 %arg) {
+; GFX6-LABEL: load_constant32bit_vgpr_offset:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT:    s_mov_b32 s2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b64 s[0:1], 0
+; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: load_constant32bit_vgpr_offset:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
+; GFX8-NEXT:    flat_load_dword v0, v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: load_constant32bit_vgpr_offset:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    ; return to shader part epilog
+entry:
+  %gep = getelementptr <{ [4294967295 x float] }>, <{ [4294967295 x float] }> addrspace(6)* null, i32 0, i32 0, i32 %arg
+  %load = load float, float addrspace(6)* %gep, align 4
+  ret float %load
+}
+
+define amdgpu_ps i32 @load_constant32bit_sgpr_offset(i32 inreg %arg) {
+; GCN-LABEL: load_constant32bit_sgpr_offset:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_lshl_b32 s0, s0, 2
+; GCN-NEXT:    s_mov_b32 s1, 0
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %gep = getelementptr <{ [4294967295 x i32] }>, <{ [4294967295 x i32] }> addrspace(6)* null, i32 0, i32 0, i32 %arg
+  %load = load i32, i32 addrspace(6)* %gep, align 4
+  ret i32 %load
+}
+
+; This <8 x float> load gets split into two dwordx4 loads during regbankselect
+define amdgpu_ps <8 x float> @load_constant32bit_vgpr_v8f32(<8 x float> addrspace(6)* %arg) {
+; GFX6-LABEL: load_constant32bit_vgpr_v8f32:
+; GFX6:       ; %bb.0: ; %entry
+; GFX6-NEXT:    v_mov_b32_e32 v4, v0
+; GFX6-NEXT:    s_mov_b32 s2, 0
+; GFX6-NEXT:    v_mov_b32_e32 v5, 0
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b64 s[0:1], 0
+; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64
+; GFX6-NEXT:    buffer_load_dwordx4 v[4:7], v[4:5], s[0:3], 0 addr64 offset:16
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: load_constant32bit_vgpr_v8f32:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    v_mov_b32_e32 v4, v0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0
+; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[4:5]
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 16, v4
+; GFX8-NEXT:    v_addc_u32_e64 v5, s[0:1], 0, 0, vcc
+; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: load_constant32bit_vgpr_v8f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    v_mov_b32_e32 v8, v0
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[8:9], off
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[8:9], off offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    ; return to shader part epilog
+entry:
+  %load = load <8 x float>, <8 x float> addrspace(6)* %arg, align 32
+  ret <8 x float> %load
+}
-- 
2.7.4