for (auto &B : F) {
LastAccess = MemAccessInfo();
for (auto &I : B) {
- if (getMemoryInstrPtr(&I)) {
+ if (const Value *Ptr = getMemoryInstrPtr(&I)) {
+ unsigned Size = divideCeil(
+ Ptr->getType()->getPointerElementType()->getPrimitiveSizeInBits(),
+ 32);
if (isIndirectAccess(&I))
- ++FI.IAMInstCount;
+ FI.IAMInstCost += Size;
if (isLargeStride(&I))
- ++FI.LSMInstCount;
- ++FI.MemInstCount;
- ++FI.InstCount;
+ FI.LSMInstCost += Size;
+ FI.MemInstCost += Size;
+ FI.InstCost += Size;
continue;
}
if (auto *CB = dyn_cast<CallBase>(&I)) {
Function *Callee = CB->getCalledFunction();
if (!Callee || Callee->isDeclaration()) {
- ++FI.InstCount;
+ ++FI.InstCost;
continue;
}
if (&F == Callee) // Handle immediate recursion
if (Loc == FIM.end())
continue;
- FI.MemInstCount += Loc->second.MemInstCount;
- FI.InstCount += Loc->second.InstCount;
- FI.IAMInstCount += Loc->second.IAMInstCount;
- FI.LSMInstCount += Loc->second.LSMInstCount;
+ FI.MemInstCost += Loc->second.MemInstCost;
+ FI.InstCost += Loc->second.InstCost;
+ FI.IAMInstCost += Loc->second.IAMInstCost;
+ FI.LSMInstCost += Loc->second.LSMInstCost;
} else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
TargetLoweringBase::AddrMode AM;
auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
GEP->getPointerAddressSpace()))
// Offset will likely be folded into load or store
continue;
- ++FI.InstCount;
+ ++FI.InstCost;
} else {
- ++FI.InstCount;
+ ++FI.InstCost;
}
}
}
const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);
- LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Info->MemInstCount
+ LLVM_DEBUG(dbgs() << F.getName() << " MemInst cost: " << Info->MemInstCost
<< '\n'
- << " IAMInst: " << Info->IAMInstCount << '\n'
- << " LSMInst: " << Info->LSMInstCount << '\n'
- << " TotalInst: " << Info->InstCount << '\n');
+ << " IAMInst cost: " << Info->IAMInstCost << '\n'
+ << " LSMInst cost: " << Info->LSMInstCost << '\n'
+ << " TotalInst cost: " << Info->InstCost << '\n');
if (isMemBound(*Info)) {
LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
}
/// Decide whether a function is memory bound.
///
/// \param FI Per-function cost summary (costs measured in 32-bit word
///           accesses, accumulated by the analysis walk).
/// \returns true when memory-instruction cost exceeds \c MemBoundThresh
///          percent of the total instruction cost.
bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  // An empty (or cost-free) function cannot be memory bound; also avoids
  // dividing by zero below.
  if (FI.InstCost == 0)
    return false;
  return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh;
}
/// Decide whether wave occupancy should be limited for a function.
///
/// Indirect-access (IAM) and large-stride (LSM) memory costs are weighted
/// more heavily (by \c IAWeight and \c LSWeight) since they hurt latency
/// hiding the most.
///
/// \param FI Per-function cost summary produced by the analysis walk.
/// \returns true when the weighted memory cost exceeds \c LimitWaveThresh
///          percent of the total instruction cost.
bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  // Guard against division by zero for functions with no counted cost.
  if (FI.InstCost == 0)
    return false;
  return ((FI.MemInstCost + FI.IAMInstCost * IAWeight +
           FI.LSMInstCost * LSWeight) * 100 / FI.InstCost) > LimitWaveThresh;
}
bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
bool needsWaveLimiter(const Function *F) const;
/// Per-function cost summary gathered by the perf-hint analysis.
/// Costs are measured in units of 32-bit word accesses (a memory
/// instruction's cost is its access size divided by 32 bits, rounded up),
/// not raw instruction counts.
struct FuncInfo {
  unsigned MemInstCost = 0;  // Cost of all memory instructions.
  unsigned InstCost = 0;     // Total cost of all instructions.
  unsigned IAMInstCost = 0;  // Indirect access memory instruction cost.
  unsigned LSMInstCost = 0;  // Large stride memory instruction cost.
  FuncInfo() = default;
};
typedef ValueMap<const Function*, FuncInfo> FuncInfoMap;
%tmp8 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp7, align 16
%tmp9 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp6
store <4 x i32> %tmp8, <4 x i32> addrspace(1)* %tmp9, align 16
- %tmp10 = add nuw nsw i64 %tmp2, 2
- %tmp11 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp10
- %tmp12 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp11, align 16
- %tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp10
- store <4 x i32> %tmp12, <4 x i32> addrspace(1)* %tmp13, align 16
- %tmp14 = add nuw nsw i64 %tmp2, 3
- %tmp15 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp14
- %tmp16 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp15, align 16
- %tmp17 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp14
- store <4 x i32> %tmp16, <4 x i32> addrspace(1)* %tmp17, align 16
ret void
}