From d274d64ef45f99387428d80a4f4b81dee91305e8 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin
Date: Wed, 23 Jun 2021 10:21:40 -0700
Subject: [PATCH] [AMDGPU] Check for pointer operand while refining LDS align

Also skip the propagation if the alignment is 1.

Differential Revision: https://reviews.llvm.org/D104796
---
 .../lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | 13 +++++++----
 .../CodeGen/AMDGPU/lower-kernel-lds-super-align.ll | 25 ++++++++++++++++++++++
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index a3a43bd..f5cd3d1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -335,7 +335,7 @@ private:
 
   void refineUsesAlignment(Value *Ptr, Align A, const DataLayout &DL,
                            unsigned MaxDepth = 5) {
-    if (!MaxDepth)
+    if (!MaxDepth || A == 1)
       return;
 
     for (User *U : Ptr->users()) {
@@ -344,15 +344,20 @@ private:
         continue;
       }
       if (auto *SI = dyn_cast<StoreInst>(U)) {
-        SI->setAlignment(std::max(A, SI->getAlign()));
+        if (SI->getPointerOperand() == Ptr)
+          SI->setAlignment(std::max(A, SI->getAlign()));
         continue;
       }
       if (auto *AI = dyn_cast<AtomicRMWInst>(U)) {
-        AI->setAlignment(std::max(A, AI->getAlign()));
+        // No atomicrmw operation can work on pointers, but check anyway in
+        // case that ever changes or we process a ConstantExpr.
+        if (AI->getPointerOperand() == Ptr)
+          AI->setAlignment(std::max(A, AI->getAlign()));
         continue;
       }
       if (auto *AI = dyn_cast<AtomicCmpXchgInst>(U)) {
-        AI->setAlignment(std::max(A, AI->getAlign()));
+        if (AI->getPointerOperand() == Ptr)
+          AI->setAlignment(std::max(A, AI->getAlign()));
         continue;
       }
       if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll
index 470177b..5ce7dcb 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-lds-super-align.ll
@@ -6,6 +6,7 @@
 ; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [32 x i8] }
 ; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { i16, [2 x i8], i16 }
 ; CHECK: %llvm.amdgcn.kernel.k3.lds.t = type { [32 x i64], [32 x i32] }
+; CHECK: %llvm.amdgcn.kernel.k4.lds.t = type { [2 x i32 addrspace(3)*] }
 
 ; CHECK-NOT: @lds.1
 @lds.1 = internal unnamed_addr addrspace(3) global [32 x i8] undef, align 1
@@ -17,6 +18,9 @@
 ; SUPER-ALIGN_ON: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t undef, align 16
 ; SUPER-ALIGN_OFF: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t undef, align 8
 
+; SUPER-ALIGN_ON: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t undef, align 16
+; SUPER-ALIGN_OFF: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t undef, align 4
+
 ; CHECK-LABEL: @k1
 ; CHECK: %1 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0), i32 0, i32 0
 ; CHECK: %2 = addrspacecast i8 addrspace(3)* %1 to i8*
@@ -127,3 +131,24 @@ define amdgpu_kernel void @k3(i64 %x) {
 
   ret void
 }
+
+@lds.6 = internal unnamed_addr addrspace(3) global [2 x i32 addrspace(3)*] undef, align 4
+
+; Check that alignment is not propagated if the use is not a pointer operand.
+
+; CHECK-LABEL: @k4
+; SUPER-ALIGN_ON: store i32 undef, i32 addrspace(3)* %ptr, align 8
+; SUPER-ALIGN_OFF: store i32 undef, i32 addrspace(3)* %ptr, align 4
+; CHECK: store i32 addrspace(3)* %ptr, i32 addrspace(3)** undef, align 4
+; SUPER-ALIGN_ON: %val1 = cmpxchg volatile i32 addrspace(3)* %ptr, i32 1, i32 2 monotonic monotonic, align 8
+; SUPER-ALIGN_OFF: %val1 = cmpxchg volatile i32 addrspace(3)* %ptr, i32 1, i32 2 monotonic monotonic, align 4
+; CHECK: %val2 = cmpxchg volatile i32 addrspace(3)** undef, i32 addrspace(3)* %ptr, i32 addrspace(3)* undef monotonic monotonic, align 4
+define amdgpu_kernel void @k4() {
+  %gep = getelementptr inbounds i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* bitcast ([2 x i32 addrspace(3)*] addrspace(3)* @lds.6 to i32 addrspace(3)* addrspace(3)*), i64 1
+  %ptr = bitcast i32 addrspace(3)* addrspace(3)* %gep to i32 addrspace(3)*
+  store i32 undef, i32 addrspace(3)* %ptr, align 4
+  store i32 addrspace(3)* %ptr, i32 addrspace(3)** undef, align 4
+  %val1 = cmpxchg volatile i32 addrspace(3)* %ptr, i32 1, i32 2 monotonic monotonic, align 4
+  %val2 = cmpxchg volatile i32 addrspace(3)** undef, i32 addrspace(3)* %ptr, i32 addrspace(3)* undef monotonic monotonic, align 4
+  ret void
+}
-- 
2.7.4
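
Editor's note: for readers skimming the diff, the standalone C++ sketch below distills the two rules this patch enforces. It is not the LLVM pass itself; MemUse, AddressOperand, and this refineUsesAlignment are illustrative stand-ins for LLVM's use lists and memory instructions. The rules: alignment knowledge about Ptr is not propagated when it is 1 (it carries no information), and it is applied to a use only when Ptr is that instruction's address operand, never when Ptr merely appears as a stored or compared value, as in the second store and second cmpxchg of @k4.

  #include <algorithm>
  #include <cassert>
  #include <cstdint>
  #include <vector>

  // Illustrative stand-in for a memory instruction's operands: the address it
  // dereferences versus the data value it moves around.
  struct MemUse {
    const void *AddressOperand; // pointer actually dereferenced by the access
    const void *ValueOperand;   // operand that is just data (may equal Ptr!)
    uint64_t Align;             // current alignment recorded on the access
  };

  // Sketch of the patched logic: bail out when A == 1, and refine a use's
  // alignment only when Ptr is the address operand, never when Ptr merely
  // appears as a value operand.
  void refineUsesAlignment(const void *Ptr, uint64_t A,
                           std::vector<MemUse> &Uses) {
    if (A == 1)
      return;
    for (MemUse &U : Uses)
      if (U.AddressOperand == Ptr)
        U.Align = std::max(U.Align, A);
  }

  int main() {
    int Slot = 0;
    const void *Ptr = &Slot; // the LDS-derived pointer, known to be align 8
    std::vector<MemUse> Uses = {
        {Ptr, nullptr, 4}, // like `store i32 undef, ... %ptr` in @k4
        {nullptr, Ptr, 4}, // like `store ... %ptr, ... undef` in @k4
    };
    refineUsesAlignment(Ptr, 8, Uses);
    assert(Uses[0].Align == 8); // address use: refined, matches the align-8 checks
    assert(Uses[1].Align == 4); // value use: untouched, matches the align-4 checks
    return 0;
  }

The two asserts mirror the SUPER-ALIGN_ON expectations in @k4: only the accesses that dereference %ptr pick up the refined alignment, while the store and cmpxchg that merely pass %ptr as a value keep their original align 4.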