[AMDGPU] Move atomic expand past infer address spaces

author Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>

Tue, 6 Jul 2021 21:34:28 +0000 (14:34 -0700)

committer Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>

Tue, 6 Jul 2021 22:53:32 +0000 (15:53 -0700)
author Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Tue, 6 Jul 2021 21:34:28 +0000 (14:34 -0700)
committer Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Tue, 6 Jul 2021 22:53:32 +0000 (15:53 -0700)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

index 395672f..1b4cbc9 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -888,9 +888,6 @@ void AMDGPUPassConfig::addIRPasses() {
    // A call to propagate attributes pass in the backend in case opt was not run.
    addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));
  
-  addPass(createAtomicExpandPass());
-
-
    addPass(createAMDGPULowerIntrinsicsPass());
  
    // Function calls are not supported, so make sure we inline everything.
@@ -921,8 +918,12 @@ void AMDGPUPassConfig::addIRPasses() {
      addPass(createAMDGPULowerModuleLDSPass());
    }
  
-  if (TM.getOptLevel() > CodeGenOpt::None) {
+  if (TM.getOptLevel() > CodeGenOpt::None)
      addPass(createInferAddressSpacesPass());
+
+  addPass(createAtomicExpandPass());
+
+  if (TM.getOptLevel() > CodeGenOpt::None) {
      addPass(createAMDGPUPromoteAlloca());
  
      if (EnableSROA)
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll

index 56c76c6..0b0bb6b 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
@@ -724,6 +724,81 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(float addrspace(1)*
    ret void
  }
  
+define amdgpu_kernel void @infer_as_before_atomic(float* addrspace(4)* %arg) #0 {
+; GFX900-LABEL: infer_as_before_atomic:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX900-NEXT:    s_mov_b64 s[2:3], 0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v1, s4
+; GFX900-NEXT:  BB9_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    v_add_f32_e32 v0, 1.0, v1
+; GFX900-NEXT:    v_mov_b32_e32 v2, 0
+; GFX900-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX900-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX900-NEXT:    v_mov_b32_e32 v1, v0
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX900-NEXT:    s_cbranch_execnz BB9_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_endpgm
+;
+; GFX908-LABEL: infer_as_before_atomic:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX908-NEXT:    v_mov_b32_e32 v0, 0
+; GFX908-NEXT:    v_mov_b32_e32 v1, 1.0
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
+; GFX908-NEXT:    s_endpgm
+;
+; GFX90A-LABEL: infer_as_before_atomic:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX10-LABEL: infer_as_before_atomic:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10-NEXT:    s_mov_b32 s2, 0
+; GFX10-NEXT:  BB9_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_add_f32_e32 v0, 1.0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s2
+; GFX10-NEXT:    s_cbranch_execnz BB9_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_endpgm
+  %load = load float*, float* addrspace(4)* %arg
+  %v = atomicrmw fadd float* %load, float 1.0 syncscope("agent-one-as") monotonic, align 4
+  ret void
+}
+
  attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
  attributes #1 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "target-cpu"="gfx803" "target-features"="+atomic-fadd-insts" "amdgpu-unsafe-fp-atomics"="true" }
  attributes #2 = { "amdgpu-unsafe-fp-atomics"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll

index b62d690..4d42307 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -34,7 +34,6 @@
  ; GCN-O0-NEXT:     Fix function bitcasts for AMDGPU
  ; GCN-O0-NEXT:     FunctionPass Manager
  ; GCN-O0-NEXT:       Early propagate attributes from kernels to functions
-; GCN-O0-NEXT:       Expand Atomic instructions
  ; GCN-O0-NEXT:     AMDGPU Lower Intrinsics
  ; GCN-O0-NEXT:     AMDGPU Inline All Functions
  ; GCN-O0-NEXT:     CallGraph Construction
@@ -44,6 +43,7 @@
  ; GCN-O0-NEXT:     Lower OpenCL enqueued blocks
  ; GCN-O0-NEXT:     Lower uses of LDS variables from non-kernel functions
  ; GCN-O0-NEXT:     FunctionPass Manager
+; GCN-O0-NEXT:       Expand Atomic instructions
  ; GCN-O0-NEXT:       Lower Garbage Collection Instructions
  ; GCN-O0-NEXT:       Shadow Stack GC Lowering
  ; GCN-O0-NEXT:       Lower constant intrinsics
@@ -177,7 +177,6 @@
  ; GCN-O1-NEXT:     Fix function bitcasts for AMDGPU
  ; GCN-O1-NEXT:     FunctionPass Manager
  ; GCN-O1-NEXT:       Early propagate attributes from kernels to functions
-; GCN-O1-NEXT:       Expand Atomic instructions
  ; GCN-O1-NEXT:     AMDGPU Lower Intrinsics
  ; GCN-O1-NEXT:     AMDGPU Inline All Functions
  ; GCN-O1-NEXT:     CallGraph Construction
@@ -188,6 +187,7 @@
  ; GCN-O1-NEXT:     Lower uses of LDS variables from non-kernel functions
  ; GCN-O1-NEXT:     FunctionPass Manager
  ; GCN-O1-NEXT:       Infer address spaces
+; GCN-O1-NEXT:       Expand Atomic instructions
  ; GCN-O1-NEXT:       AMDGPU Promote Alloca
  ; GCN-O1-NEXT:       Dominator Tree Construction
  ; GCN-O1-NEXT:       SROA
@@ -425,7 +425,6 @@
  ; GCN-O1-OPTS-NEXT:     Fix function bitcasts for AMDGPU
  ; GCN-O1-OPTS-NEXT:     FunctionPass Manager
  ; GCN-O1-OPTS-NEXT:       Early propagate attributes from kernels to functions
-; GCN-O1-OPTS-NEXT:       Expand Atomic instructions
  ; GCN-O1-OPTS-NEXT:     AMDGPU Lower Intrinsics
  ; GCN-O1-OPTS-NEXT:     AMDGPU Inline All Functions
  ; GCN-O1-OPTS-NEXT:     CallGraph Construction
@@ -436,6 +435,7 @@
  ; GCN-O1-OPTS-NEXT:     Lower uses of LDS variables from non-kernel functions
  ; GCN-O1-OPTS-NEXT:     FunctionPass Manager
  ; GCN-O1-OPTS-NEXT:       Infer address spaces
+; GCN-O1-OPTS-NEXT:       Expand Atomic instructions
  ; GCN-O1-OPTS-NEXT:       AMDGPU Promote Alloca
  ; GCN-O1-OPTS-NEXT:       Dominator Tree Construction
  ; GCN-O1-OPTS-NEXT:       SROA
@@ -706,7 +706,6 @@
  ; GCN-O2-NEXT:     Fix function bitcasts for AMDGPU
  ; GCN-O2-NEXT:     FunctionPass Manager
  ; GCN-O2-NEXT:       Early propagate attributes from kernels to functions
-; GCN-O2-NEXT:       Expand Atomic instructions
  ; GCN-O2-NEXT:     AMDGPU Lower Intrinsics
  ; GCN-O2-NEXT:     AMDGPU Inline All Functions
  ; GCN-O2-NEXT:     CallGraph Construction
@@ -717,6 +716,7 @@
  ; GCN-O2-NEXT:     Lower uses of LDS variables from non-kernel functions
  ; GCN-O2-NEXT:     FunctionPass Manager
  ; GCN-O2-NEXT:       Infer address spaces
+; GCN-O2-NEXT:       Expand Atomic instructions
  ; GCN-O2-NEXT:       AMDGPU Promote Alloca
  ; GCN-O2-NEXT:       Dominator Tree Construction
  ; GCN-O2-NEXT:       SROA
@@ -988,7 +988,6 @@
  ; GCN-O3-NEXT:     Fix function bitcasts for AMDGPU
  ; GCN-O3-NEXT:     FunctionPass Manager
  ; GCN-O3-NEXT:       Early propagate attributes from kernels to functions
-; GCN-O3-NEXT:       Expand Atomic instructions
  ; GCN-O3-NEXT:     AMDGPU Lower Intrinsics
  ; GCN-O3-NEXT:     AMDGPU Inline All Functions
  ; GCN-O3-NEXT:     CallGraph Construction
@@ -999,6 +998,7 @@
  ; GCN-O3-NEXT:     Lower uses of LDS variables from non-kernel functions
  ; GCN-O3-NEXT:     FunctionPass Manager
  ; GCN-O3-NEXT:       Infer address spaces
+; GCN-O3-NEXT:       Expand Atomic instructions
  ; GCN-O3-NEXT:       AMDGPU Promote Alloca
  ; GCN-O3-NEXT:       Dominator Tree Construction
  ; GCN-O3-NEXT:       SROA
author	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
	Tue, 6 Jul 2021 21:34:28 +0000 (14:34 -0700)
committer	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
	Tue, 6 Jul 2021 22:53:32 +0000 (15:53 -0700)
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp		patch | blob | history
llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll		patch | blob | history
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll		patch | blob | history