From cb5b52a06eeb7cc868944bb08f71fffe13f33412 Mon Sep 17 00:00:00 2001
From: Changpeng Fang
Date: Tue, 5 Jan 2021 14:47:19 -0800
Subject: [PATCH] AMDGPU: Annotate amdgpu.noclobber for global loads only

Summary:
  This is to avoid unnecessary analysis since amdgpu.noclobber is only
used for globals.

Reviewers: arsenm

Fixes: SWDEV-239161

Differential Revision: https://reviews.llvm.org/D94107
---
 .../Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp |  5 ++-
 llvm/test/CodeGen/AMDGPU/annotate-noclobber.ll    | 47 ++++++++++++++++++++++
 llvm/test/CodeGen/AMDGPU/store-clobbers-load.ll   | 18 ++++-----
 3 files changed, 59 insertions(+), 11 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/annotate-noclobber.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index f23c4c1..0123450 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -142,10 +142,11 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
   }
 
   bool NotClobbered = false;
+  bool GlobalLoad = isGlobalLoad(I);
   if (PtrI)
-    NotClobbered = !isClobberedInFunction(&I);
+    NotClobbered = GlobalLoad && !isClobberedInFunction(&I);
   else if (isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) {
-    if (isGlobalLoad(I) && !isClobberedInFunction(&I)) {
+    if (GlobalLoad && !isClobberedInFunction(&I)) {
       NotClobbered = true;
       // Lookup for the existing GEP
       if (noClobberClones.count(Ptr)) {
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-noclobber.ll b/llvm/test/CodeGen/AMDGPU/annotate-noclobber.ll
new file mode 100644
index 0000000..580ea20
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/annotate-noclobber.ll
@@ -0,0 +1,47 @@
+; RUN: opt -S --amdgpu-annotate-uniform < %s | FileCheck -check-prefix=OPT %s
+target datalayout = "A5"
+
+
+; OPT-LABEL: @amdgpu_noclobber_global(
+; OPT: %addr = getelementptr i32, i32 addrspace(1)* %in, i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; OPT-NEXT: %load = load i32, i32 addrspace(1)* %addr, align 4
+define amdgpu_kernel void @amdgpu_noclobber_global(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+  %addr = getelementptr i32, i32 addrspace(1)* %in, i64 0
+  %load = load i32, i32 addrspace(1)* %addr, align 4
+  store i32 %load, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; OPT-LABEL: @amdgpu_noclobber_local(
+; OPT: %addr = getelementptr i32, i32 addrspace(3)* %in, i64 0, !amdgpu.uniform !0
+; OPT-NEXT: %load = load i32, i32 addrspace(3)* %addr, align 4
+define amdgpu_kernel void @amdgpu_noclobber_local(i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
+entry:
+  %addr = getelementptr i32, i32 addrspace(3)* %in, i64 0
+  %load = load i32, i32 addrspace(3)* %addr, align 4
+  store i32 %load, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; OPT-LABEL: @amdgpu_noclobber_private(
+; OPT: %addr = getelementptr i32, i32 addrspace(5)* %in, i64 0, !amdgpu.uniform !0
+; OPT-NEXT: %load = load i32, i32 addrspace(5)* %addr, align 4
+define amdgpu_kernel void @amdgpu_noclobber_private(i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
+entry:
+  %addr = getelementptr i32, i32 addrspace(5)* %in, i64 0
+  %load = load i32, i32 addrspace(5)* %addr, align 4
+  store i32 %load, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; OPT-LABEL: @amdgpu_noclobber_flat(
+; OPT: %addr = getelementptr i32, i32 addrspace(4)* %in, i64 0, !amdgpu.uniform !0
+; OPT-NEXT: %load = load i32, i32 addrspace(4)* %addr, align 4
+define amdgpu_kernel void @amdgpu_noclobber_flat(i32 addrspace(4)* %in, i32 addrspace(1)* %out) {
+entry:
+  %addr = getelementptr i32, i32 addrspace(4)* %in, i64 0
+  %load = load i32, i32 addrspace(4)* %addr, align 4
+  store i32 %load, i32 addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/store-clobbers-load.ll b/llvm/test/CodeGen/AMDGPU/store-clobbers-load.ll
index 88cfffe..fbf6990 100644
--- a/llvm/test/CodeGen/AMDGPU/store-clobbers-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-clobbers-load.ll
@@ -4,24 +4,24 @@ target datalayout = "A5"
 
 ; "load vaddr" depends on the store, so we should not mark vaddr as amdgpu.noclobber.
 ; OPT-LABEL: @store_clobbers_load(
-; OPT: %vaddr = bitcast [4 x i32] addrspace(5)* %alloca to <4 x i32> addrspace(5)*, !amdgpu.uniform !0
-; OPT-NEXT: %zero = load <4 x i32>, <4 x i32> addrspace(5)* %vaddr, align 16
-define amdgpu_kernel void @store_clobbers_load(i32 addrspace(1)* %out, i32 %index) {
+; OPT: %vaddr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %input, i64 0, !amdgpu.uniform !0
+; OPT-NEXT: %zero = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr, align 16
+define amdgpu_kernel void @store_clobbers_load(<4 x i32> addrspace(1)* %input, i32 addrspace(1)* %out, i32 %index) {
 entry:
-  %alloca = alloca [4 x i32], addrspace(5)
-  %addr0 = bitcast [4 x i32] addrspace(5)* %alloca to i32 addrspace(5)*
-  store i32 0, i32 addrspace(5)* %addr0
-  %vaddr = bitcast [4 x i32] addrspace(5)* %alloca to <4 x i32> addrspace(5)*
-  %zero = load <4 x i32>, <4 x i32> addrspace(5)* %vaddr, align 16
+  %addr0 = bitcast <4 x i32> addrspace(1)* %input to i32 addrspace(1)*
+  store i32 0, i32 addrspace(1)* %addr0
+  %vaddr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %input, i64 0
+  %zero = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr, align 16
   %one = insertelement <4 x i32> %zero, i32 1, i32 1
   %two = insertelement <4 x i32> %one, i32 2, i32 2
   %three = insertelement <4 x i32> %two, i32 3, i32 3
-  store <4 x i32> %three, <4 x i32> addrspace(5)* %vaddr, align 16
+  store <4 x i32> %three, <4 x i32> addrspace(1)* %input, align 16
   %rslt = extractelement <4 x i32> %three, i32 %index
   store i32 %rslt, i32 addrspace(1)* %out, align 4
   ret void
 }
+
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 
 @lds0 = addrspace(3) global [512 x i32] undef, align 4
-- 
2.7.4
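
Note for readers: the change amounts to a short-circuit. The address-space
test is evaluated first, so the comparatively expensive clobber walk in
isClobberedInFunction never runs for loads that could not receive the
metadata anyway (C++ guarantees the right-hand side of && is skipped when
GlobalLoad is false). Below is a minimal standalone C++ sketch of that
control flow; the helper name mayAnnotateNoClobber and the hard-coded
address-space constant are illustrative only, while the pass itself uses
its isGlobalLoad lambda and isClobberedInFunction as shown in the first
hunk above:

  // Illustrative sketch only: mirrors the patched control flow, not the pass.
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/IR/Instructions.h"

  static bool mayAnnotateNoClobber(
      llvm::LoadInst &I,
      llvm::function_ref<bool(llvm::LoadInst *)> IsClobbered) {
    // amdgpu.noclobber is only consumed for global-address-space loads
    // (address space 1 on AMDGPU), so bail out before any clobber
    // analysis for every other address space.
    if (I.getPointerAddressSpace() != 1 /* AMDGPUAS::GLOBAL_ADDRESS */)
      return false;
    // Only global loads pay for the walk over potential clobbers.
    return !IsClobbered(&I);
  }

Hoisting the address-space check into a GlobalLoad flag also keeps the two
branches of the if/else consistent, so both paths apply the same guard.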