From 98f47131228cc73b5308e3cc6fd70375808594e3 Mon Sep 17 00:00:00 2001 From: hsmahesha Date: Wed, 1 Sep 2021 10:18:21 +0530 Subject: [PATCH] [AMDGPU] Split entry basic block after alloca instructions. While initializing the LDS pointers within entry basic block of kernel(s), make sure that the entry basic block is split after alloca instructions. Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D108971 --- .../AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp | 17 ++++-- ...place-lds-by-ptr-split-entry-bb-after-alloca.ll | 61 ++++++++++++++++++++++ 2 files changed, 74 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-split-entry-bb-after-alloca.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp index dabb4d0..39e6afd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp @@ -191,10 +191,19 @@ class ReplaceLDSUseImpl { if (!BasicBlockEntry.second) return BasicBlockEntry.first->second; - // Split entry basic block of kernel K. - auto *EI = &(*(K->getEntryBlock().getFirstInsertionPt())); - IRBuilder<> Builder(EI); - + // Split entry basic block of kernel K just after alloca. + // + // Find the split point just after alloca. + auto &EBB = K->getEntryBlock(); + auto *EI = &(*(EBB.getFirstInsertionPt())); + BasicBlock::reverse_iterator RIT(EBB.getTerminator()); + while (!isa(*RIT) && (&*RIT != EI)) + ++RIT; + if (isa(*RIT)) + --RIT; + + // Split entry basic block. + IRBuilder<> Builder(&*RIT); Value *Mbcnt = Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, {Builder.getInt32(-1), Builder.getInt32(0)}); diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-split-entry-bb-after-alloca.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-split-entry-bb-after-alloca.ll new file mode 100644 index 0000000..80122cf0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-split-entry-bb-after-alloca.ll @@ -0,0 +1,61 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s + +; DESCRIPTION: +; +; There is one lds global defined here, and this lds is used within a single non-kernel +; function, as an operand of nested constant expression, and this non-kernel function is +; reachable from kernel. Hence nested constant expression should to be converted into a +; series of instructons and pointer replacement should take place. +; +; Further the entry basic block of the kernel @k0 contains alloca instruction. Hence the +; entry basic splitting for pointer initialization should happen after alloca. +; + +; Original LDS should exist. +; CHECK: @used_only_within_func = addrspace(3) global [4 x i32] undef, align 4 +@used_only_within_func = addrspace(3) global [4 x i32] undef, align 4 + +; Pointers should be created. +; CHECK: @used_only_within_func.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 + +; Pointer replacement code should be added. +define void @f0(i32 %x) { +; CHECK-LABEL: entry: +; CHECK: %0 = load i16, i16 addrspace(3)* @used_only_within_func.ptr, align 2 +; CHECK: %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 +; CHECK: %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* +; CHECK: %3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, i32 0, i32 0 +; CHECK: %4 = addrspacecast i32 addrspace(3)* %3 to i32* +; CHECK: %5 = ptrtoint i32* %4 to i64 +; CHECK: %6 = add i64 %5, %5 +; CHECK: %7 = inttoptr i64 %6 to i32* +; CHECK: store i32 %x, i32* %7, align 4 +; CHECK: ret void +entry: + store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_func to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast ([4 x i32] addrspace(3)* @used_only_within_func to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4 + ret void +} + +; Pointer initialization code shoud be added +define amdgpu_kernel void @k0() { +; CHECK-LABEL: entry: +; CHECK: %0 = alloca i64, align 8, addrspace(5) +; CHECK: %1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; CHECK: %2 = icmp eq i32 %1, 0 +; CHECK: br i1 %2, label %3, label %4 +; +; CHECK-LABEL: 3: +; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @used_only_within_func to i16), i16 addrspace(3)* @used_only_within_func.ptr, align 2 +; CHECK: br label %4 + +; CHECK-LABEL: 4: +; CHECK: call void @llvm.amdgcn.wave.barrier() +; CHECK: %5 = addrspacecast i64 addrspace(5)* %0 to i64* +; CHECK: call void @f0(i32 0) +; CHECK: ret void +entry: + %0 = alloca i64, align 8, addrspace(5) + %1 = addrspacecast i64 addrspace(5)* %0 to i64* + call void @f0(i32 0) + ret void +} -- 2.7.4