From d7b4b76956e2da8b251c4e6121624477e1491211 Mon Sep 17 00:00:00 2001
From: pvanhout
Date: Thu, 16 Mar 2023 14:35:27 +0100
Subject: [PATCH] [AMDGPU] Handle memset users in PromoteAlloca

Allows allocas with memset users to be promoted. This is intended to stop
patterns such as `memset(&alloca, 0, sizeof(alloca))` (which I think can be
emitted by frontends) from blocking the vectorization of allocas.
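For illustration only (this snippet is not part of the patch; the function and
variable names are hypothetical), the kind of source-level pattern this is
aimed at is roughly:

    #include <cstring>

    void example(long V, long *Out) {
      long Tmp[6];
      // Whole-object, non-volatile memset over the local array.
      std::memset(Tmp, 0, sizeof(Tmp));
      Tmp[0] = V;
      *Out = Tmp[0] + Tmp[1];
    }

A frontend would typically lower the memset to an `llvm.memset` intrinsic
covering the full size of the alloca; before this patch, such a user made
tryPromoteAllocaToVector give up on the alloca.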
Fixes SWDEV-388784

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D146225
---
 llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp    | 25 ++++++
 llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll | 96 +++++++++++++++++++++++
 2 files changed, 121 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index a7da400..2fe5fbe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -21,6 +21,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/Pass.h"
 #include "llvm/Target/TargetMachine.h"
 
@@ -384,6 +385,19 @@ struct MemTransferInfo {
   ConstantInt *DestIndex = nullptr;
 };
 
+// Checks if the instruction I is a memset user of the alloca AI that we can
+// deal with. Currently, only non-volatile memsets that affect the whole alloca
+// are handled.
+static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
+                              const DataLayout &DL) {
+  using namespace PatternMatch;
+  // For now we only care about non-volatile memsets that affect the whole type
+  // (start at index 0 and fill the whole alloca).
+  const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType());
+  return I->getOperand(0) == AI &&
+         match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
+}
+
 static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
                                      unsigned MaxVGPRs) {
 
@@ -485,6 +499,12 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
       continue;
     }
 
+    if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst);
+        MSI && isSupportedMemset(MSI, Alloca, DL)) {
+      WorkList.push_back(Inst);
+      continue;
+    }
+
     if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Inst)) {
       if (TransferInst->isVolatile())
         return false;
@@ -609,6 +629,11 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
       Builder.CreateAlignedStore(NewVecValue, BitCast, Alloca->getAlign());
 
       Inst->eraseFromParent();
+    } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
+      // Ensure the length parameter of the memsets matches the new vector
+      // type's. In general, the type size shouldn't change so this is a
+      // no-op, but it's better to be safe.
+      MSI->setOperand(2, Builder.getInt64(DL.getTypeStoreSize(VectorTy)));
     } else {
       llvm_unreachable("Unsupported call when promoting alloca to vector");
     }
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
new file mode 100644
index 0000000..f31421d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca,sroa < %s | FileCheck %s
+
+; Checks that memsets don't block PromoteAlloca.
+
+; Note: memsets are just updated with the new type size. They are not eliminated, which means
+; the original allocas also stay. This puts a bit more load on SROA.
+; If PromoteAlloca is moved to SSAUpdater, we could just entirely replace the memsets with
+; e.g. ConstantAggregate.
+
+define amdgpu_kernel void @memset_all_zero(i64 %val) {
+; CHECK-LABEL: @memset_all_zero(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <6 x i64> zeroinitializer, i64 [[VAL:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <6 x i64> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <6 x i64> [[TMP0]], i64 [[VAL]], i64 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %stack = alloca [6 x i64], align 4, addrspace(5)
+  call void @llvm.memset.p5.i64(ptr addrspace(5) %stack, i8 0, i64 48, i1 false)
+  store i64 %val, ptr addrspace(5) %stack
+  %reload = load i64, ptr addrspace(5) %stack
+  %stack.1 = getelementptr [6 x i64], ptr addrspace(5) %stack, i64 0, i64 1
+  store i64 %val, ptr addrspace(5) %stack.1
+  ret void
+}
+
+define amdgpu_kernel void @memset_all_5(i64 %val) {
+; CHECK-LABEL: @memset_all_5(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i64> <i64 361700864190383365, i64 361700864190383365, i64 361700864190383365, i64 361700864190383365>, i64 [[VAL:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[VAL]], i64 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %stack = alloca [4 x i64], align 4, addrspace(5)
+  call void @llvm.memset.p5.i64(ptr addrspace(5) %stack, i8 5, i64 32, i1 false)
+  store i64 %val, ptr addrspace(5) %stack
+  %reload = load i64, ptr addrspace(5) %stack
+  %stack.1 = getelementptr [6 x i64], ptr addrspace(5) %stack, i64 0, i64 1
+  store i64 %val, ptr addrspace(5) %stack.1
+  ret void
+}
+
+define amdgpu_kernel void @memset_volatile_nopromote(i64 %val) {
+; CHECK-LABEL: @memset_volatile_nopromote(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[STACK_SROA_0:%.*]] = alloca i64, align 8, addrspace(5)
+; CHECK-NEXT:    [[STACK_SROA_2:%.*]] = alloca [3 x i64], align 8, addrspace(5)
+; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_0]], i8 0, i64 8, i1 true)
+; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_2]], i8 0, i64 24, i1 true)
+; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK_SROA_0]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %stack = alloca [4 x i64], align 4, addrspace(5)
+  call void @llvm.memset.p5.i64(ptr addrspace(5) %stack, i8 0, i64 32, i1 true)
+  store i64 %val, ptr addrspace(5) %stack
+  ret void
+}
+
+define amdgpu_kernel void @memset_badsize_nopromote(i64 %val) {
+; CHECK-LABEL: @memset_badsize_nopromote(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[STACK_SROA_0:%.*]] = alloca i64, align 8, addrspace(5)
+; CHECK-NEXT:    [[STACK_SROA_2:%.*]] = alloca [23 x i8], align 4, addrspace(5)
+; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_0]], i8 0, i64 8, i1 true)
+; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 4 [[STACK_SROA_2]], i8 0, i64 23, i1 true)
+; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK_SROA_0]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %stack = alloca [4 x i64], align 4, addrspace(5)
+  call void @llvm.memset.p5.i64(ptr addrspace(5) %stack, i8 0, i64 31, i1 true)
+  store i64 %val, ptr addrspace(5) %stack
+  ret void
+}
+
+define amdgpu_kernel void @memset_offset_ptr_nopromote(i64 %val) {
+; CHECK-LABEL: @memset_offset_ptr_nopromote(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[STACK_SROA_1:%.*]] = alloca [3 x i64], align 8, addrspace(5)
+; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_1]], i8 0, i64 24, i1 true)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %stack = alloca [4 x i64], align 4, addrspace(5)
+  %gep = getelementptr [4 x i64], ptr addrspace(5) %stack, i64 0, i64 1
+  call void @llvm.memset.p5.i64(ptr addrspace(5) %gep, i8 0, i64 24, i1 true)
+  store i64 %val, ptr addrspace(5) %stack
+  ret void
+}
+
+declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture writeonly, i8, i64, i1 immarg)
-- 
2.7.4