[AMDGPU] Use performOptimizedStructLayout for LDS sort

author Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>

Tue, 15 Jun 2021 21:51:59 +0000 (14:51 -0700)

committer Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>

Tue, 22 Jun 2021 16:58:10 +0000 (09:58 -0700)
author Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Tue, 15 Jun 2021 21:51:59 +0000 (14:51 -0700)
committer Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Tue, 22 Jun 2021 16:58:10 +0000 (09:58 -0700)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp

index 48e3ad6..30acbc9 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -46,8 +46,8 @@
  #include "llvm/Pass.h"
  #include "llvm/Support/CommandLine.h"
  #include "llvm/Support/Debug.h"
+#include "llvm/Support/OptimizedStructLayout.h"
  #include "llvm/Transforms/Utils/ModuleUtils.h"
-#include <algorithm>
  #include <vector>
  
  #define DEBUG_TYPE "amdgpu-lower-module-lds"
@@ -210,35 +210,25 @@ private:
        }
      }
  
-    // Sort by alignment, descending, to minimise padding.
-    // On ties, sort by size, descending, then by name, lexicographical.
-    llvm::stable_sort(
-        FoundLocalVars,
-        [&](const GlobalVariable *LHS, const GlobalVariable *RHS) -> bool {
-          Align ALHS = AMDGPU::getAlign(DL, LHS);
-          Align ARHS = AMDGPU::getAlign(DL, RHS);
-          if (ALHS != ARHS) {
-            return ALHS > ARHS;
-          }
-
-          TypeSize SLHS = DL.getTypeAllocSize(LHS->getValueType());
-          TypeSize SRHS = DL.getTypeAllocSize(RHS->getValueType());
-          if (SLHS != SRHS) {
-            return SLHS > SRHS;
-          }
-
-          // By variable name on tie for predictable order in test cases.
-          return LHS->getName() < RHS->getName();
-        });
+    SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
+    LayoutFields.reserve(FoundLocalVars.size());
+    for (GlobalVariable *GV : FoundLocalVars) {
+      OptimizedStructLayoutField F(GV, DL.getTypeAllocSize(GV->getValueType()),
+                                   AMDGPU::getAlign(DL, GV));
+      LayoutFields.emplace_back(F);
+    }
+
+    performOptimizedStructLayout(LayoutFields);
  
      std::vector<GlobalVariable *> LocalVars;
      LocalVars.reserve(FoundLocalVars.size()); // will be at least this large
      {
        // This usually won't need to insert any padding, perhaps avoid the alloc
        uint64_t CurrentOffset = 0;
-      for (size_t I = 0; I < FoundLocalVars.size(); I++) {
-        GlobalVariable *FGV = FoundLocalVars[I];
-        Align DataAlign = AMDGPU::getAlign(DL, FGV);
+      for (size_t I = 0; I < LayoutFields.size(); I++) {
+        GlobalVariable *FGV = static_cast<GlobalVariable *>(
+            const_cast<void *>(LayoutFields[I].Id));
+        Align DataAlign = LayoutFields[I].Alignment;
  
          uint64_t DataAlignV = DataAlign.value();
          if (uint64_t Rem = CurrentOffset % DataAlignV) {
@@ -257,7 +247,7 @@ private:
          }
  
          LocalVars.push_back(FGV);
-        CurrentOffset += DL.getTypeAllocSize(FGV->getValueType());
+        CurrentOffset += LayoutFields[I].Size;
        }
      }
  
@@ -272,14 +262,14 @@ private:
            : "llvm.amdgcn.module.lds");
      StructType *LDSTy = StructType::create(Ctx, LocalVarTypes, VarName + ".t");
  
-    Align MaxAlign =
-        AMDGPU::getAlign(DL, LocalVars[0]); // was sorted on alignment
+    Align StructAlign =
+        AMDGPU::getAlign(DL, LocalVars[0]);
  
      GlobalVariable *SGV = new GlobalVariable(
          M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
          VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
          false);
-    SGV->setAlignment(MaxAlign);
+    SGV->setAlignment(StructAlign);
      if (!F) {
        appendToCompilerUsed(
            M, {static_cast<GlobalValue *>(
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll

index 948c07d..f7070a6 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
@@ -20,7 +20,7 @@
  ; CHECK: @llvm.amdgcn.kernel.timestwo.lds = internal addrspace(3) global %llvm.amdgcn.kernel.timestwo.lds.t undef, align 4
  
  ; CHECK-LABEL: @get_func()
-; CHECK: %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+; CHECK: %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
  define i32 @get_func() local_unnamed_addr #0 {
  entry:
    %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
@@ -28,7 +28,7 @@ entry:
  }
  
  ; CHECK-LABEL: @set_func(i32 %x)
-; CHECK:  store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
  define void @set_func(i32 %x) local_unnamed_addr #1 {
  entry:
    store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
@@ -40,14 +40,14 @@ entry:
  ; CHECK: %1 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
  ; CHECK: %2 = addrspacecast i32 addrspace(3)* %1 to i32*
  ; CHECK: %3 = ptrtoint i32* %2 to i64
-; CHECK: %4 = add i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), %3
+; CHECK: %4 = add i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), %3
  ; CHECK: %5 = inttoptr i64 %4 to i32*
  ; CHECK: %ld = load i32, i32* %5, align 4
  ; CHECK: %mul = mul i32 %ld, 2
  ; CHECK: %6 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
  ; CHECK: %7 = addrspacecast i32 addrspace(3)* %6 to i32*
  ; CHECK: %8 = ptrtoint i32* %7 to i64
-; CHECK: %9 = add i64 %8, ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)
+; CHECK: %9 = add i64 %8, ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)
  ; CHECK: %10 = inttoptr i64 %9 to i32*
  ; CHECK: store i32 %mul, i32* %10, align 4
  define amdgpu_kernel void @timestwo() {
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll

index 0ce2aba..104c877 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll
@@ -16,7 +16,7 @@
  ;          But none of them are used anywhere. Hence, @lds.6 is not lowered.
  ;.
  
-; CHECK: %llvm.amdgcn.module.lds.t = type { [4 x i8], [3 x i8], [1 x i8], [2 x i8], [1 x i8] }
+; CHECK: %llvm.amdgcn.module.lds.t = type { [4 x i8], [3 x i8], [1 x i8], [2 x i8] }
  
  ; CHECK-NOT: @lds.1
  ; CHECK-NOT: @lds.2
@@ -41,7 +41,7 @@
  ; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 4
  ; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0, i32 0) to i8*)], section "llvm.metadata"
  
-; CHECK: @alias.to.lds.1 = alias [1 x i8], getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 4)
+; CHECK: @alias.to.lds.1 = alias [1 x i8], getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2)
  ; CHECK: @alias.to.lds.2 = alias [2 x i8], getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 3)
  ; CHECK: @alias.to.gptr.3 = alias i64*, i64* addrspace(1)* @gptr.3
  ; CHECK: @alias.to.gptr.4 = alias i64*, i64* addrspace(1)* @gptr.4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll

index 7985ecf..77fcefa 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll
@@ -17,7 +17,7 @@
  ;          and @gptr.8 is used within non-kernel function @f1. Hence @lds.7 is lowered.
  ;.
  
-; CHECK: %llvm.amdgcn.module.lds.t = type { [3 x float], [4 x i8], [2 x float], [1 x float] }
+; CHECK: %llvm.amdgcn.module.lds.t = type { [3 x float], [1 x float], [2 x float] }
  
  ; CHECK: @lds.1 = addrspace(3) global i16 undef, align 2
  ; CHECK: @lds.2 = addrspace(3) global i32 undef, align 4
@@ -36,7 +36,7 @@
  
  ; CHECK: @gptr.3 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* @lds.3 to i64*), align 8
  ; CHECK: @gptr.4 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast (float addrspace(3)* @lds.4 to i64 addrspace(3)*) to i64*), align 8
-; CHECK: @gptr.5 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast ([1 x float] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 3) to i64 addrspace(3)*) to i64*), align 8
+; CHECK: @gptr.5 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast ([1 x float] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i64 addrspace(3)*) to i64*), align 8
  ; CHECK: @gptr.6 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast ([2 x float] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2) to i64 addrspace(3)*) to i64*), align 8
  ; CHECK: @gptr.7 = addrspace(1) global i64* addrspacecast (i64 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i64 addrspace(3)*) to i64*), align 8
  ; CHECK: @gptr.8 = addrspace(1) global i64** addrspacecast (i64* addrspace(1)* @gptr.7 to i64**), align 8
diff --git a/llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll b/llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll

index 1678344..6c5669e 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll
@@ -5,10 +5,10 @@
  ; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [16 x i8], [8 x i8], [4 x i8], [2 x i8], [1 x i8] }
  
  ; Different properly aligned values, but same size of 1.
-; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [1 x i8], [7 x i8], [1 x i8], [3 x i8], [1 x i8], [1 x i8], [1 x i8], [1 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [1 x i8], [1 x i8], [1 x i8], [1 x i8], [1 x i8], [3 x i8], [1 x i8] }
  
  ; All are under-aligned, requires to fix each on different alignment boundary.
-; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { [9 x i8], [7 x i8], [5 x i8], [3 x i8], [3 x i8], [1 x i8], [2 x i8] }
+; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { [9 x i8], [1 x i8], [2 x i8], [3 x i8], [1 x i8], [5 x i8] }
  
  ; All LDS are underaligned, requires to allocate on 8 byte boundary
  ; CHECK: %llvm.amdgcn.kernel.k3.lds.t = type { [7 x i8], [1 x i8], [7 x i8], [1 x i8], [6 x i8], [2 x i8], [5 x i8] }
author	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
	Tue, 15 Jun 2021 21:51:59 +0000 (14:51 -0700)
committer	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
	Tue, 22 Jun 2021 16:58:10 +0000 (09:58 -0700)
llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/lower-module-lds-global-alias.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/lower-module-lds-global-uses.ll		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/update-lds-alignment.ll		patch \| blob \| history