From d7b4b76956e2da8b251c4e6121624477e1491211 Mon Sep 17 00:00:00 2001
From: pvanhout
Date: Thu, 16 Mar 2023 14:35:27 +0100
Subject: [PATCH] [AMDGPU] Handle memset users in PromoteAlloca

Allows allocas with memset users to be promoted. This is intended to stop
patterns such as `memset(&alloca, 0, sizeof(alloca))` (which I think can be
emitted by frontends) from blocking the vectorization of allocas.
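For illustration only (this snippet is not part of the patch; the function and
variable names are hypothetical), the kind of source-level pattern this is
aimed at is roughly:

    #include <cstring>

    void example(long V, long *Out) {
      long Tmp[6];
      // Whole-object, non-volatile memset over the local array.
      std::memset(Tmp, 0, sizeof(Tmp));
      Tmp[0] = V;
      *Out = Tmp[0] + Tmp[1];
    }

A frontend would typically lower the memset to an `llvm.memset` intrinsic
covering the full size of the alloca; before this patch, such a user made
tryPromoteAllocaToVector give up on the alloca.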
Fixes SWDEV-388784

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D146225
---
 llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp    | 25 ++++++
 llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll | 96 +++++++++++++++++++++++
 2 files changed, 121 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index a7da400..2fe5fbe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -21,6 +21,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/Pass.h"
 #include "llvm/Target/TargetMachine.h"
 
@@ -384,6 +385,19 @@ struct MemTransferInfo {
   ConstantInt *DestIndex = nullptr;
 };
 
+// Checks if the instruction I is a memset user of the alloca AI that we can
+// deal with. Currently, only non-volatile memsets that affect the whole alloca
+// are handled.
+static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
+                              const DataLayout &DL) {
+  using namespace PatternMatch;
+  // For now we only care about non-volatile memsets that affect the whole type
+  // (start at index 0 and fill the whole alloca).
+  const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType());
+  return I->getOperand(0) == AI &&
+         match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
+}
+
 static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
                                      unsigned MaxVGPRs) {
 
@@ -485,6 +499,12 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
       continue;
     }
 
+    if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst);
+        MSI && isSupportedMemset(MSI, Alloca, DL)) {
+      WorkList.push_back(Inst);
+      continue;
+    }
+
     if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Inst)) {
       if (TransferInst->isVolatile())
         return false;
@@ -609,6 +629,11 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
       Builder.CreateAlignedStore(NewVecValue, BitCast, Alloca->getAlign());
 
       Inst->eraseFromParent();
+    } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
+      // Ensure the length parameter of the memsets matches the new vector
+      // type's. In general, the type size shouldn't change so this is a
+      // no-op, but it's better to be safe.
+      MSI->setOperand(2, Builder.getInt64(DL.getTypeStoreSize(VectorTy)));
     } else {
       llvm_unreachable("Unsupported call when promoting alloca to vector");
     }
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
new file mode 100644
index 0000000..f31421d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca,sroa < %s | FileCheck %s
+
+; Checks that memsets don't block PromoteAlloca.
+
+; Note: memsets are just updated with the new type size. They are not eliminated, which means
+; the original allocas also stay. This puts a bit more load on SROA.
+; If PromoteAlloca is moved to SSAUpdater, we could just entirely replace the memsets with
+; e.g. ConstantAggregate.
+
+define amdgpu_kernel void @memset_all_zero(i64 %val) {
+; CHECK-LABEL: @memset_all_zero(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <6 x i64> zeroinitializer, i64 [[VAL:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <6 x i64> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <6 x i64> [[TMP0]], i64 [[VAL]], i64 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %stack = alloca [6 x i64], align 4, addrspace(5)
+  call void @llvm.memset.p5.i64(ptr addrspace(5) %stack, i8 0, i64 48, i1 false)
+  store i64 %val, ptr addrspace(5) %stack
+  %reload = load i64, ptr addrspace(5) %stack
+  %stack.1 = getelementptr [6 x i64], ptr addrspace(5) %stack, i64 0, i64 1
+  store i64 %val, ptr addrspace(5) %stack.1
+  ret void
+}
+
+define amdgpu_kernel void @memset_all_5(i64 %val) {
+; CHECK-LABEL: @memset_all_5(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i64> <i64 361700864190383365, i64 361700864190383365, i64 361700864190383365, i64 361700864190383365>, i64 [[VAL:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[VAL]], i64 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %stack = alloca [4 x i64], align 4, addrspace(5)
+  call void @llvm.memset.p5.i64(ptr addrspace(5) %stack, i8 5, i64 32, i1 false)
+  store i64 %val, ptr addrspace(5) %stack
+  %reload = load i64, ptr addrspace(5) %stack
+  %stack.1 = getelementptr [6 x i64], ptr addrspace(5) %stack, i64 0, i64 1
+  store i64 %val, ptr addrspace(5) %stack.1
+  ret void
+}
+
+define amdgpu_kernel void @memset_volatile_nopromote(i64 %val) {
+; CHECK-LABEL: @memset_volatile_nopromote(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[STACK_SROA_0:%.*]] = alloca i64, align 8, addrspace(5)
+; CHECK-NEXT:    [[STACK_SROA_2:%.*]] = alloca [3 x i64], align 8, addrspace(5)
+; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_0]], i8 0, i64 8, i1 true)
+; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_2]], i8 0, i64 24, i1 true)
+; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK_SROA_0]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %stack = alloca [4 x i64], align 4, addrspace(5)
+  call void @llvm.memset.p5.i64(ptr addrspace(5) %stack, i8 0, i64 32, i1 true)
+  store i64 %val, ptr addrspace(5) %stack
+  ret void
+}
+
+define amdgpu_kernel void @memset_badsize_nopromote(i64 %val) {
+; CHECK-LABEL: @memset_badsize_nopromote(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[STACK_SROA_0:%.*]] = alloca i64, align 8, addrspace(5)
+; CHECK-NEXT:    [[STACK_SROA_2:%.*]] = alloca [23 x i8], align 4, addrspace(5)
+; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_0]], i8 0, i64 8, i1 true)
+; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 4 [[STACK_SROA_2]], i8 0, i64 23, i1 true)
+; CHECK-NEXT:    store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK_SROA_0]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %stack = alloca [4 x i64], align 4, addrspace(5)
+  call void @llvm.memset.p5.i64(ptr addrspace(5) %stack, i8 0, i64 31, i1 true)
+  store i64 %val, ptr addrspace(5) %stack
+  ret void
+}
+
+define amdgpu_kernel void @memset_offset_ptr_nopromote(i64 %val) {
+; CHECK-LABEL: @memset_offset_ptr_nopromote(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[STACK_SROA_1:%.*]] = alloca [3 x i64], align 8, addrspace(5)
+; CHECK-NEXT:    call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_1]], i8 0, i64 24, i1 true)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %stack = alloca [4 x i64], align 4, addrspace(5)
+  %gep = getelementptr [4 x i64], ptr addrspace(5) %stack, i64 0, i64 1
+  call void @llvm.memset.p5.i64(ptr addrspace(5) %gep, i8 0, i64 24, i1 true)
+  store i64 %val, ptr addrspace(5) %stack
+  ret void
+}
+
+declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture writeonly, i8, i64, i1 immarg)
-- 
2.7.4