[AMDGPU] Tune perfhint analysis to account for access width

author Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>

Thu, 8 Jul 2021 19:23:52 +0000 (12:23 -0700)

committer Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>

Wed, 21 Jul 2021 19:46:10 +0000 (12:46 -0700)
author Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Thu, 8 Jul 2021 19:23:52 +0000 (12:23 -0700)
committer Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Wed, 21 Jul 2021 19:46:10 +0000 (12:46 -0700)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp

index 9d65653..2aa0229 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -209,19 +209,22 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
    for (auto &B : F) {
      LastAccess = MemAccessInfo();
      for (auto &I : B) {
-      if (getMemoryInstrPtr(&I)) {
+      if (const Value *Ptr = getMemoryInstrPtr(&I)) {
+        unsigned Size = divideCeil(
+            Ptr->getType()->getPointerElementType()->getPrimitiveSizeInBits(),
+            32);
          if (isIndirectAccess(&I))
-          ++FI.IAMInstCount;
+          FI.IAMInstCost += Size;
          if (isLargeStride(&I))
-          ++FI.LSMInstCount;
-        ++FI.MemInstCount;
-        ++FI.InstCount;
+          FI.LSMInstCost += Size;
+        FI.MemInstCost += Size;
+        FI.InstCost += Size;
          continue;
        }
        if (auto *CB = dyn_cast<CallBase>(&I)) {
          Function *Callee = CB->getCalledFunction();
          if (!Callee || Callee->isDeclaration()) {
-          ++FI.InstCount;
+          ++FI.InstCost;
            continue;
          }
          if (&F == Callee) // Handle immediate recursion
@@ -231,10 +234,10 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
          if (Loc == FIM.end())
            continue;
  
-        FI.MemInstCount += Loc->second.MemInstCount;
-        FI.InstCount += Loc->second.InstCount;
-        FI.IAMInstCount += Loc->second.IAMInstCount;
-        FI.LSMInstCount += Loc->second.LSMInstCount;
+        FI.MemInstCost += Loc->second.MemInstCost;
+        FI.InstCost += Loc->second.InstCost;
+        FI.IAMInstCost += Loc->second.IAMInstCost;
+        FI.LSMInstCost += Loc->second.LSMInstCost;
        } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
          TargetLoweringBase::AddrMode AM;
          auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
@@ -244,9 +247,9 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
                                         GEP->getPointerAddressSpace()))
            // Offset will likely be folded into load or store
            continue;
-        ++FI.InstCount;
+        ++FI.InstCost;
        } else {
-        ++FI.InstCount;
+        ++FI.InstCost;
        }
      }
    }
@@ -264,11 +267,11 @@ bool AMDGPUPerfHint::runOnFunction(Function &F) {
  
    const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);
  
-  LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Info->MemInstCount
+  LLVM_DEBUG(dbgs() << F.getName() << " MemInst cost: " << Info->MemInstCost
                      << '\n'
-                    << " IAMInst: " << Info->IAMInstCount << '\n'
-                    << " LSMInst: " << Info->LSMInstCount << '\n'
-                    << " TotalInst: " << Info->InstCount << '\n');
+                    << " IAMInst cost: " << Info->IAMInstCost << '\n'
+                    << " LSMInst cost: " << Info->LSMInstCost << '\n'
+                    << " TotalInst cost: " << Info->InstCost << '\n');
  
    if (isMemBound(*Info)) {
      LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
@@ -286,13 +289,12 @@ bool AMDGPUPerfHint::runOnFunction(Function &F) {
  }
  
  bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
-  return FI.MemInstCount * 100 / FI.InstCount > MemBoundThresh;
+  return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh;
  }
  
  bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
-  return ((FI.MemInstCount + FI.IAMInstCount * IAWeight +
-           FI.LSMInstCount * LSWeight) *
-          100 / FI.InstCount) > LimitWaveThresh;
+  return ((FI.MemInstCost + FI.IAMInstCost * IAWeight +
+           FI.LSMInstCost * LSWeight) * 100 / FI.InstCost) > LimitWaveThresh;
  }
  
  bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h

index 99dbf50..31ff80f 100644 (file)
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
@@ -37,12 +37,11 @@ public:
    bool needsWaveLimiter(const Function *F) const;
  
    struct FuncInfo {
-    unsigned MemInstCount;
-    unsigned InstCount;
-    unsigned IAMInstCount; // Indirect access memory instruction count
-    unsigned LSMInstCount; // Large stride memory instruction count
-    FuncInfo() : MemInstCount(0), InstCount(0), IAMInstCount(0),
-                 LSMInstCount(0) {}
+    unsigned MemInstCost;
+    unsigned InstCost;
+    unsigned IAMInstCost; // Indirect access memory instruction cost
+    unsigned LSMInstCost; // Large stride memory instruction cost
+    FuncInfo() : MemInstCost(0), InstCost(0), IAMInstCost(0), LSMInstCost(0) {}
    };
  
    typedef ValueMap<const Function*, FuncInfo> FuncInfoMap;
diff --git a/llvm/test/CodeGen/AMDGPU/perfhint.ll b/llvm/test/CodeGen/AMDGPU/perfhint.ll

index 1fef142..89f4fae 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/perfhint.ll
+++ b/llvm/test/CodeGen/AMDGPU/perfhint.ll
@@ -16,16 +16,6 @@ bb:
    %tmp8 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp7, align 16
    %tmp9 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp6
    store <4 x i32> %tmp8, <4 x i32> addrspace(1)* %tmp9, align 16
-  %tmp10 = add nuw nsw i64 %tmp2, 2
-  %tmp11 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp10
-  %tmp12 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp11, align 16
-  %tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp10
-  store <4 x i32> %tmp12, <4 x i32> addrspace(1)* %tmp13, align 16
-  %tmp14 = add nuw nsw i64 %tmp2, 3
-  %tmp15 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp14
-  %tmp16 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp15, align 16
-  %tmp17 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp14
-  store <4 x i32> %tmp16, <4 x i32> addrspace(1)* %tmp17, align 16
    ret void
  }
author	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
	Thu, 8 Jul 2021 19:23:52 +0000 (12:23 -0700)
committer	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
	Wed, 21 Jul 2021 19:46:10 +0000 (12:46 -0700)
llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp		patch \| blob \| history
llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/perfhint.ll		patch \| blob \| history