#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Debug.h"
class AMDGPUAnnotateUniformValues : public FunctionPass,
public InstVisitor<AMDGPUAnnotateUniformValues> {
DivergenceAnalysis *DA;
+ MemoryDependenceResults *MDR;
+ LoopInfo *LI;
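+  // GEP clones created to carry the "amdgpu.noclobber" metadata for pointer
+  // arguments and global values, keyed by the original pointer.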
+ DenseMap<Value*, GetElementPtrInst*> noClobberClones;
+ bool isKernelFunc;
public:
static char ID;
  AMDGPUAnnotateUniformValues() :
    FunctionPass(ID) { }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<DivergenceAnalysis>();
+ AU.addRequired<MemoryDependenceWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
AU.setPreservesAll();
}
void visitBranchInst(BranchInst &I);
void visitLoadInst(LoadInst &I);
-
+ bool isClobberedInFunction(LoadInst * Load);
};
} // End anonymous namespace
INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
"Add AMDGPU uniform metadata", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
"Add AMDGPU uniform metadata", false, false)
static void setUniformMetadata(Instruction *I) {
I->setMetadata("amdgpu.uniform", MDNode::get(I->getContext(), {}));
}
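+// Mark the instruction defining a load's pointer so instruction selection
+// knows the loaded memory is not clobbered within the kernel.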
+static void setNoClobberMetadata(Instruction *I) {
+ I->setMetadata("amdgpu.noclobber", MDNode::get(I->getContext(), {}));
+}
+
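+// Depth-first walk collecting all transitive predecessors of Root into Set.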
+static void DFS(BasicBlock *Root, SetVector<BasicBlock*> & Set) {
+ for (auto I : predecessors(Root))
+ if (Set.insert(I))
+ DFS(I, Set);
+}
+
+bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) {
+  // 1. Get the loop containing the load's block.
+  // 2. If there is one, add every block of the outermost enclosing loop to
+  //    the check list and start the predecessor walk from that loop's header;
+  //    otherwise start from the load's own block.
+  // 3. DFS over all predecessors of the start block, then check every
+  //    collected block for clobbering writes.
+ SetVector<BasicBlock *> Checklist;
+ BasicBlock *Start = Load->getParent();
+ Checklist.insert(Start);
+ const Value *Ptr = Load->getPointerOperand();
+ const Loop *L = LI->getLoopFor(Start);
+ if (L) {
+ const Loop *P = L;
+ do {
+ L = P;
+ P = P->getParentLoop();
+ } while (P);
+ Checklist.insert(L->block_begin(), L->block_end());
+ Start = L->getHeader();
+ }
+
+ DFS(Start, Checklist);
+ for (auto &BB : Checklist) {
+ BasicBlock::iterator StartIt = (BB == Load->getParent()) ?
+ BasicBlock::iterator(Load) : BB->end();
+ if (MDR->getPointerDependencyFrom(MemoryLocation(Ptr),
+ true, StartIt, BB, Load).isClobber())
+ return true;
+ }
+ return false;
+}
void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
  if (I.isUnconditional())
    return;

void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
  Value *Ptr = I.getPointerOperand();
if (!DA->isUniform(Ptr))
return;
+ auto isGlobalLoad = [](LoadInst &Load)->bool {
+ return Load.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+ };
+  // We only track clobbers within the current function: a FunctionPass cannot
+  // look beyond its boundaries. Therefore memory can only be proven
+  // not-clobbered for loads that live in kernel functions.
+ bool NotClobbered = isKernelFunc && !isClobberedInFunction(&I);
+ Instruction *PtrI = dyn_cast<Instruction>(Ptr);
+ if (!PtrI && NotClobbered && isGlobalLoad(I)) {
+ if (isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) {
+      // Look up an existing GEP clone for this pointer.
+ if (noClobberClones.count(Ptr)) {
+ PtrI = noClobberClones[Ptr];
+ } else {
+        // Otherwise create a GEP clone of the Value to attach the metadata to.
+ Function *F = I.getParent()->getParent();
+ Value *Idx = Constant::getIntegerValue(
+ Type::getInt32Ty(Ptr->getContext()), APInt(64, 0));
+ // Insert GEP at the entry to make it dominate all uses
+ PtrI = GetElementPtrInst::Create(
+ Ptr->getType()->getPointerElementType(), Ptr,
+ ArrayRef<Value*>(Idx), Twine(""), F->getEntryBlock().getFirstNonPHI());
+        noClobberClones[Ptr] = PtrI;
+      }
+ I.replaceUsesOfWith(Ptr, PtrI);
+ }
+ }
- if (Instruction *PtrI = dyn_cast<Instruction>(Ptr))
+ if (PtrI) {
setUniformMetadata(PtrI);
-
+ if (NotClobbered)
+ setNoClobberMetadata(PtrI);
+ }
}
bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) {
  return false;
}
bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;
- DA = &getAnalysis<DivergenceAnalysis>();
- visit(F);
+ DA = &getAnalysis<DivergenceAnalysis>();
+ MDR = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ isKernelFunc = F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
+ visit(F);
+ noClobberClones.clear();
return true;
}
CFALUBug(false),
HasVertexCache(false),
TexVTXClauseSize(0),
+ ScalarizeGlobal(false),
FeatureDisable(false),
InstrItins(getInstrItineraryForCPU(GPU)),
bool CFALUBug;
bool HasVertexCache;
short TexVTXClauseSize;
+ bool ScalarizeGlobal;
// Dummy feature to use for assembler in tablegen.
bool FeatureDisable;
return alignTo(FlatWorkGroupSize, getWavefrontSize()) / getWavefrontSize();
}
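+  /// Control whether uniform, non-clobbered global loads may be selected to
+  /// scalar (SMRD) loads.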
+ void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;}
+ bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;}
+
/// \returns Subtarget's default pair of minimum/maximum flat work group sizes
/// for function \p F, or minimum/maximum flat work group sizes explicitly
/// requested using "amdgpu-flat-work-group-size" attribute attached to
cl::init(true),
cl::Hidden);
+// Option to control global load scalarization.
+static cl::opt<bool> ScalarizeGlobal(
+ "amdgpu-scalarize-global-loads",
+ cl::desc("Enable global load scalarization"),
+ cl::init(false),
+ cl::Hidden);
+
+
extern "C" void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
I->setGISelAccessor(*GISel);
}
+ I->setScalarizeGlobalBehavior(ScalarizeGlobal);
+
return I.get();
}
return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
}
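+// Returns true if the pointer operand of this memory operation was marked
+// "amdgpu.noclobber" by the AMDGPUAnnotateUniformValues pass.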
+bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
+ const MemSDNode *MemNode = cast<MemSDNode>(N);
+ const Value *Ptr = MemNode->getMemOperand()->getValue();
+ const Instruction *I = dyn_cast<Instruction>(Ptr);
+ return I && I->getMetadata("amdgpu.noclobber");
+}
+
bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
// Flat -> private/local is a simple truncate.
if (isMemOpUniform(Load))
return SDValue();
// Non-uniform loads will be selected to MUBUF instructions, so they
- // have the same legalization requires ments as global and private
+ // have the same legalization requirements as global and private
// loads.
//
LLVM_FALLTHROUGH;
- case AMDGPUAS::GLOBAL_ADDRESS:
+ case AMDGPUAS::GLOBAL_ADDRESS: {
+ if (isMemOpUniform(Load) && isMemOpHasNoClobberedMemOperand(Load))
+ return SDValue();
+ // Non-uniform loads will be selected to MUBUF instructions, so they
+ // have the same legalization requirements as global and private
+ // loads.
+ //
+ }
+ LLVM_FALLTHROUGH;
case AMDGPUAS::FLAT_ADDRESS:
if (NumElements > 4)
return SplitVectorLoad(Op, DAG);
MachineFunction &MF) const override;
bool isMemOpUniform(const SDNode *N) const;
+ bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const;
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
bool isCheapAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
// Scalar Memory Patterns
//===----------------------------------------------------------------------===//
+
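+// A load qualifies for scalar (SMRD) selection if it is at least 4-byte
+// aligned and either a uniform constant-address-space load or, when global
+// load scalarization is enabled, a uniform global load whose memory is known
+// not to be clobbered ("amdgpu.noclobber").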
def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
auto Ld = cast<LoadSDNode>(N);
return Ld->getAlignment() >= 4 &&
- Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
- static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N);
+ ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) ||
+ (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
+ static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N) &&
+ static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)));
}]>;
def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">;
--- /dev/null
+; RUN: llc -O2 -mtriple amdgcn--amdhsa -mcpu=fiji -amdgpu-scalarize-global-loads=true -verify-machineinstrs < %s | FileCheck %s
+
+; uniform loads
+; CHECK-LABEL: @uniform_load
+; CHECK: s_load_dwordx4
+; CHECK-NOT: flat_load_dword
+
+define amdgpu_kernel void @uniform_load(float addrspace(1)* %arg, float addrspace(1)* %arg1) {
+bb:
+ %tmp2 = load float, float addrspace(1)* %arg, align 4, !tbaa !8
+ %tmp3 = fadd float %tmp2, 0.000000e+00
+ %tmp4 = getelementptr inbounds float, float addrspace(1)* %arg, i64 1
+ %tmp5 = load float, float addrspace(1)* %tmp4, align 4, !tbaa !8
+ %tmp6 = fadd float %tmp3, %tmp5
+ %tmp7 = getelementptr inbounds float, float addrspace(1)* %arg, i64 2
+ %tmp8 = load float, float addrspace(1)* %tmp7, align 4, !tbaa !8
+ %tmp9 = fadd float %tmp6, %tmp8
+ %tmp10 = getelementptr inbounds float, float addrspace(1)* %arg, i64 3
+ %tmp11 = load float, float addrspace(1)* %tmp10, align 4, !tbaa !8
+ %tmp12 = fadd float %tmp9, %tmp11
+ %tmp13 = getelementptr inbounds float, float addrspace(1)* %arg1
+ store float %tmp12, float addrspace(1)* %tmp13, align 4, !tbaa !8
+ ret void
+}
+
+; non-uniform loads
+; CHECK-LABEL: @non-uniform_load
+; CHECK: flat_load_dword
+; CHECK-NOT: s_load_dwordx4
+
+define amdgpu_kernel void @non-uniform_load(float addrspace(1)* %arg, float addrspace(1)* %arg1) #0 {
+bb:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp2 = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tmp
+ %tmp3 = load float, float addrspace(1)* %tmp2, align 4, !tbaa !8
+ %tmp4 = fadd float %tmp3, 0.000000e+00
+ %tmp5 = add i32 %tmp, 1
+ %tmp6 = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tmp5
+ %tmp7 = load float, float addrspace(1)* %tmp6, align 4, !tbaa !8
+ %tmp8 = fadd float %tmp4, %tmp7
+ %tmp9 = add i32 %tmp, 2
+ %tmp10 = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tmp9
+ %tmp11 = load float, float addrspace(1)* %tmp10, align 4, !tbaa !8
+ %tmp12 = fadd float %tmp8, %tmp11
+ %tmp13 = add i32 %tmp, 3
+ %tmp14 = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tmp13
+ %tmp15 = load float, float addrspace(1)* %tmp14, align 4, !tbaa !8
+ %tmp16 = fadd float %tmp12, %tmp15
+ %tmp17 = getelementptr inbounds float, float addrspace(1)* %arg1, i32 %tmp
+ store float %tmp16, float addrspace(1)* %tmp17, align 4, !tbaa !8
+ ret void
+}
+
+
+; uniform load dominated by no-alias store - scalarize
+; CHECK-LABEL: @no_memdep_alias_arg
+; CHECK: flat_store_dword
+; CHECK: s_load_dword [[SVAL:s[0-9]+]]
+; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
+; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
+
+define amdgpu_kernel void @no_memdep_alias_arg(i32 addrspace(1)* noalias %in, i32 addrspace(1)* %out0, i32 addrspace(1)* %out1) {
+ store i32 0, i32 addrspace(1)* %out0
+ %val = load i32, i32 addrspace(1)* %in
+ store i32 %val, i32 addrspace(1)* %out1
+ ret void
+}
+
+; uniform load dominated by alias store - vector
+; CHECK-LABEL: {{^}}memdep:
+; CHECK: flat_store_dword
+; CHECK: flat_load_dword [[VVAL:v[0-9]+]]
+; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
+define amdgpu_kernel void @memdep(i32 addrspace(1)* %in, i32 addrspace(1)* %out0, i32 addrspace(1)* %out1) {
+ store i32 0, i32 addrspace(1)* %out0
+ %val = load i32, i32 addrspace(1)* %in
+ store i32 %val, i32 addrspace(1)* %out1
+ ret void
+}
+
+; uniform load from global array
+; CHECK-LABEL: @global_array
+; CHECK: s_load_dwordx2 [[A_ADDR:s\[[0-9]+:[0-9]+\]]]
+; CHECK: s_load_dwordx2 [[A_ADDR1:s\[[0-9]+:[0-9]+\]]], [[A_ADDR]], 0x0
+; CHECK: s_load_dword [[SVAL:s[0-9]+]], [[A_ADDR1]], 0x0
+; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
+; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
+
+@A = common local_unnamed_addr addrspace(1) global i32 addrspace(1)* null, align 4
+
+define amdgpu_kernel void @global_array(i32 addrspace(1)* nocapture %out) {
+entry:
+ %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4
+ %1 = load i32, i32 addrspace(1)* %0, align 4
+ store i32 %1, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+
+; uniform load from global array dominated by alias store
+; CHECK-LABEL: @global_array_alias_store
+; CHECK: flat_store_dword
+; CHECK: v_mov_b32_e32 v[[ADDR_LO:[0-9]+]], s{{[0-9]+}}
+; CHECK: v_mov_b32_e32 v[[ADDR_HI:[0-9]+]], s{{[0-9]+}}
+; CHECK: flat_load_dwordx2 [[A_ADDR:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[ADDR_LO]]:[[ADDR_HI]]{{\]}}
+; CHECK: flat_load_dword [[VVAL:v[0-9]+]], [[A_ADDR]]
+; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
+define amdgpu_kernel void @global_array_alias_store(i32 addrspace(1)* nocapture %out, i32 %n) {
+entry:
+ %gep = getelementptr i32, i32 addrspace(1) * %out, i32 %n
+ store i32 12, i32 addrspace(1) * %gep
+ %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4
+ %1 = load i32, i32 addrspace(1)* %0, align 4
+ store i32 %1, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { "target-cpu"="fiji" }
+attributes #1 = { nounwind readnone }
+
+!8 = !{!9, !9, i64 0}
+!9 = !{!"float", !10, i64 0}
+!10 = !{!"omnipotent char", !11, i64 0}
+!11 = !{!"Simple C/C++ TBAA"}
--- /dev/null
+; RUN: llc -mtriple amdgcn--amdhsa -mcpu=fiji -amdgpu-scalarize-global-loads=true -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: %bb11
+
+; The load from %arg in the loop body has an aliasing store.
+
+; CHECK: flat_load_dword
+
+; CHECK-LABEL: %bb20
+; CHECK: flat_store_dword
+
+; #####################################################################
+
+; CHECK-LABEL: %bb22
+
+; The load from %arg has an aliasing store in the loop.
+
+; CHECK: flat_load_dword
+
+; #####################################################################
+
+; The load from %arg1 has no aliasing store in the loop: arg1[i+1] never aliases arg1[i].
+
+; CHECK: s_load_dword
+
+define amdgpu_kernel void @cfg(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) #0 {
+bb:
+ %tmp = sext i32 %arg2 to i64
+ %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp
+ %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4, !tbaa !0
+ %tmp5 = icmp sgt i32 %tmp4, 0
+ br i1 %tmp5, label %bb6, label %bb8
+
+bb6: ; preds = %bb
+ br label %bb11
+
+bb7: ; preds = %bb22
+ br label %bb8
+
+bb8: ; preds = %bb7, %bb
+ %tmp9 = phi i32 [ 0, %bb ], [ %tmp30, %bb7 ]
+ %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp
+ store i32 %tmp9, i32 addrspace(1)* %tmp10, align 4, !tbaa !0
+ ret void
+
+bb11: ; preds = %bb22, %bb6
+ %tmp12 = phi i32 [ %tmp30, %bb22 ], [ 0, %bb6 ]
+ %tmp13 = phi i32 [ %tmp25, %bb22 ], [ 0, %bb6 ]
+ %tmp14 = srem i32 %tmp13, %arg2
+ %tmp15 = sext i32 %tmp14 to i64
+ %tmp16 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp15
+ %tmp17 = load i32, i32 addrspace(1)* %tmp16, align 4, !tbaa !0
+ %tmp18 = icmp sgt i32 %tmp17, 100
+ %tmp19 = sext i32 %tmp13 to i64
+ br i1 %tmp18, label %bb20, label %bb22
+
+bb20: ; preds = %bb11
+ %tmp21 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp19
+ store i32 0, i32 addrspace(1)* %tmp21, align 4, !tbaa !0
+ br label %bb22
+
+bb22: ; preds = %bb20, %bb11
+ %tmp23 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp19
+ %tmp24 = load i32, i32 addrspace(1)* %tmp23, align 4, !tbaa !0
+ %tmp25 = add nuw nsw i32 %tmp13, 1
+ %tmp26 = sext i32 %tmp25 to i64
+ %tmp27 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp26
+ %tmp28 = load i32, i32 addrspace(1)* %tmp27, align 4, !tbaa !0
+ %tmp29 = add i32 %tmp24, %tmp12
+ %tmp30 = add i32 %tmp29, %tmp28
+ %tmp31 = icmp eq i32 %tmp25, %tmp4
+ br i1 %tmp31, label %bb7, label %bb11
+}
+
+attributes #0 = { "target-cpu"="fiji" }
+
+!0 = !{!1, !1, i64 0}
+!1 = !{!"int", !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C/C++ TBAA"}