return ST.hasApertureRegs();
}
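+ /// Get the flat work group size bounds for \p F: the values of its
+ /// "amdgpu-flat-work-group-size" attribute if present, otherwise the
+ /// subtarget defaults.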
+ std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) {
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ return ST.getFlatWorkGroupSizes(F);
+ }
+
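+ /// Get the widest flat work group size range the subtarget supports,
+ /// independent of any attribute already on \p F.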
+ std::pair<unsigned, unsigned>
+ getMaximumFlatWorkGroupRange(const Function &F) {
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
+ }
+
private:
/// Check if the ConstantExpr \p CE requires the queue pointer attribute.
static bool visitConstExpr(const ConstantExpr *CE) {
llvm_unreachable("AAAMDAttributes is only valid for function position");
}
+/// Propagate amdgpu-flat-work-group-size attribute.
+struct AAAMDFlatWorkGroupSize
+ : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
+ using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;
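+
+ /// The underlying state is a 32-bit integer range of flat work group
+ /// sizes: initialize() narrows it to the subtarget bounds and
+ /// updateImpl() widens it to cover every caller.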
+ AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
+ : Base(IRP, 32) {}
+
+ /// See AbstractAttribute::getState(...).
+ IntegerRangeState &getState() override { return *this; }
+ const IntegerRangeState &getState() const override { return *this; }
+
+ void initialize(Attributor &A) override {
+ Function *F = getAssociatedFunction();
+ auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
+ unsigned MinGroupSize, MaxGroupSize;
+ std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F);
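+ // ConstantRange is half-open, so the inclusive [Min, Max] bounds become
+ // [Min, Max + 1).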
+ intersectKnown(
+ ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1)));
+ }
+
+ ChangeStatus updateImpl(Attributor &A) override {
+ ChangeStatus Change = ChangeStatus::UNCHANGED;
+
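+ // Union each caller's assumed range into ours: this function must accept
+ // any flat work group size a caller may be launched with.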
+ auto CheckCallSite = [&](AbstractCallSite CS) {
+ Function *Caller = CS.getInstruction()->getFunction();
+ LLVM_DEBUG(dbgs() << "[AAAMDFlatWorkGroupSize] Call " << Caller->getName()
+ << "->" << getAssociatedFunction()->getName() << '\n');
+
+ const auto &CallerInfo = A.getAAFor<AAAMDFlatWorkGroupSize>(
+ *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
+
+ Change |=
+ clampStateAndIndicateChange(this->getState(), CallerInfo.getState());
+
+ return true;
+ };
+
+ bool AllCallSitesKnown = true;
+ if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
+ return indicatePessimisticFixpoint();
+
+ return Change;
+ }
+
+ ChangeStatus manifest(Attributor &A) override {
+ SmallVector<Attribute, 8> AttrList;
+ Function *F = getAssociatedFunction();
+ LLVMContext &Ctx = F->getContext();
+
+ auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
+ unsigned Min, Max;
+ std::tie(Min, Max) = InfoCache.getMaximumFlatWorkGroupRange(*F);
+
+ // Don't add the attribute if it's the implied default.
+ if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max)
+ return ChangeStatus::UNCHANGED;
+
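+ // Print the assumed range in the attribute's "Min,Max" text form; the
+ // state's upper bound is exclusive, hence the - 1.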
+ SmallString<10> Buffer;
+ raw_svector_ostream OS(Buffer);
+ OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
+
+ AttrList.push_back(
+ Attribute::get(Ctx, "amdgpu-flat-work-group-size", OS.str()));
+ return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
+ /* ForceReplace */ true);
+ }
+
+ const std::string getAsStr() const override {
+ std::string Str;
+ raw_string_ostream OS(Str);
+ OS << "AMDFlatWorkGroupSize[";
+ OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
+ OS << ']';
+ return OS.str();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+
+ /// Create an abstract attribute view for the position \p IRP.
+ static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
+ Attributor &A);
+
+ /// See AbstractAttribute::getName()
+ const std::string getName() const override {
+ return "AAAMDFlatWorkGroupSize";
+ }
+
+ /// See AbstractAttribute::getIdAddr()
+ const char *getIdAddr() const override { return &ID; }
+
+ /// This function should return true if the type of the \p AA is
+ /// AAAMDFlatWorkGroupSize
+ static bool classof(const AbstractAttribute *AA) {
+ return (AA->getIdAddr() == &ID);
+ }
+
+ /// Unique ID (due to the unique address)
+ static const char ID;
+};
+
+const char AAAMDFlatWorkGroupSize::ID = 0;
+
+AAAMDFlatWorkGroupSize &
+AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
+ Attributor &A) {
+ if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
+ return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
+ llvm_unreachable(
+ "AAAMDFlatWorkGroupSize is only valid for function position");
+}
+
class AMDGPUAttributor : public ModulePass {
public:
AMDGPUAttributor() : ModulePass(ID) {}
BumpPtrAllocator Allocator;
AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
DenseSet<const char *> Allowed(
- {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID, &AACallEdges::ID});
+ {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
+ &AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID});
Attributor A(Functions, InfoCache, CGUpdater, &Allowed);
if (!F.isIntrinsic()) {
A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
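+ // Entry points keep their explicit or default bounds; only propagate
+ // a range into callable functions.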
+ if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) {
+ A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
+ }
}
}
// Attributes to propagate.
// TODO: Support conservative min/max merging instead of cloning.
-static constexpr const char* AttributeNames[] = {
- "amdgpu-waves-per-eu",
- "amdgpu-flat-work-group-size"
-};
+static constexpr const char *AttributeNames[] = {"amdgpu-waves-per-eu"};
static constexpr unsigned NumAttr =
sizeof(AttributeNames) / sizeof(AttributeNames[0]);
+++ /dev/null
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late %s | FileCheck %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-propagate-attributes-late %s | FileCheck %s
-
-; CHECK: define internal void @max_flat_1_1024() #0 {
-define internal void @max_flat_1_1024() #0 {
- ret void
-}
-
-; CHECK: define internal void @max_flat_1_256() #1 {
-define internal void @max_flat_1_256() #1 {
- ret void
-}
-
-; CHECK: define amdgpu_kernel void @kernel_1_256_call_default() #1 {
-define amdgpu_kernel void @kernel_1_256_call_default() #1 {
- call void @default()
- ret void
-}
-
-; CHECK: define amdgpu_kernel void @kernel_1_256_call_1_256() #1 {
-define amdgpu_kernel void @kernel_1_256_call_1_256() #1 {
- call void @max_flat_1_256()
- ret void
-}
-
-; CHECK: define amdgpu_kernel void @kernel_1_256_call_64_64() #1 {
-define amdgpu_kernel void @kernel_1_256_call_64_64() #1 {
- call void @max_flat_64_64()
- ret void
-}
-
-; CHECK: define internal void @max_flat_64_64() #2 {
-define internal void @max_flat_64_64() #2 {
- ret void
-}
-
-; CHECK: define internal void @default() #2 {
-define internal void @default() #3 {
- ret void
-}
-
-attributes #0 = { noinline "amdgpu-flat-work-group-size"="1,1024" }
-attributes #1 = { noinline "amdgpu-flat-work-group-size"="1,256" }
-attributes #2 = { noinline "amdgpu-flat-work-group-size"="64,64" }
-attributes #3 = { noinline }
-
-; CHECK: attributes #0 = { noinline "amdgpu-flat-work-group-size"="1,1024"
-; CHECK-NEXT: attributes #1 = { noinline "amdgpu-flat-work-group-size"="1,256"
-; CHECK-NEXT: attributes #2 = { noinline "amdgpu-flat-work-group-size"="1,256"
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-attributor %s | FileCheck %s
+
+; Check propagation of amdgpu-flat-work-group-size attribute.
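+; A callee's assumed range is the union of its callers' ranges, intersected
+; with any bounds the callee already carries.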
+
+; Called from a single kernel with 1,256
+define internal void @default_to_1_256() {
+; CHECK-LABEL: define {{[^@]+}}@default_to_1_256
+; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+
+define amdgpu_kernel void @kernel_1_256() #0 {
+; CHECK-LABEL: define {{[^@]+}}@kernel_1_256
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT: call void @default_to_1_256()
+; CHECK-NEXT: ret void
+;
+ call void @default_to_1_256()
+ ret void
+}
+
+; Called from a single kernel with 64,128
+define internal void @default_to_64_128() {
+; CHECK-LABEL: define {{[^@]+}}@default_to_64_128
+; CHECK-SAME: () #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+
+define amdgpu_kernel void @kernel_64_128() #1 {
+; CHECK-LABEL: define {{[^@]+}}@kernel_64_128
+; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-NEXT: call void @default_to_64_128()
+; CHECK-NEXT: call void @flat_group_64_64()
+; CHECK-NEXT: call void @default_to_64_256()
+; CHECK-NEXT: call void @flat_group_128_256()
+; CHECK-NEXT: ret void
+;
+ call void @default_to_64_128()
+ call void @flat_group_64_64()
+ call void @default_to_64_256()
+ call void @flat_group_128_256()
+ ret void
+}
+
+; Called from kernels with 128,512 and 512,512
+define internal void @default_to_128_512() {
+; CHECK-LABEL: define {{[^@]+}}@default_to_128_512
+; CHECK-SAME: () #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+
+; This already has strict bounds, but is called from kernels with wider
+; bounds, and should not be changed.
+define internal void @flat_group_64_64() #2 {
+; CHECK-LABEL: define {{[^@]+}}@flat_group_64_64
+; CHECK-SAME: () #[[ATTR3:[0-9]+]] {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+
+; 128,256 -> 128,128
+define internal void @flat_group_128_256() #3 {
+; CHECK-LABEL: define {{[^@]+}}@flat_group_128_256
+; CHECK-SAME: () #[[ATTR4:[0-9]+]] {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+
+define internal void @flat_group_512_1024() #4 {
+; CHECK-LABEL: define {{[^@]+}}@flat_group_512_1024
+; CHECK-SAME: () #[[ATTR5:[0-9]+]] {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+
+define amdgpu_kernel void @kernel_128_512() #5 {
+; CHECK-LABEL: define {{[^@]+}}@kernel_128_512
+; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-NEXT: call void @default_to_128_512()
+; CHECK-NEXT: call void @flat_group_64_64()
+; CHECK-NEXT: ret void
+;
+ call void @default_to_128_512()
+ call void @flat_group_64_64()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_512_512() #6 {
+; CHECK-LABEL: define {{[^@]+}}@kernel_512_512
+; CHECK-SAME: () #[[ATTR5]] {
+; CHECK-NEXT: call void @default_to_128_512()
+; CHECK-NEXT: call void @flat_group_512_1024()
+; CHECK-NEXT: ret void
+;
+ call void @default_to_128_512()
+ call void @flat_group_512_1024()
+ ret void
+}
+
+; Called from kernels with 128,256 and 64,128 => 64,256
+define internal void @default_to_64_256() {
+; CHECK-LABEL: define {{[^@]+}}@default_to_64_256
+; CHECK-SAME: () #[[ATTR6:[0-9]+]] {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+
+; The kernel's lower bound is higher than the callee's lower bound, so
+; this should probably be illegal.
+define amdgpu_kernel void @kernel_128_256() #3 {
+; CHECK-LABEL: define {{[^@]+}}@kernel_128_256
+; CHECK-SAME: () #[[ATTR7:[0-9]+]] {
+; CHECK-NEXT: call void @default_to_64_256()
+; CHECK-NEXT: ret void
+;
+ call void @default_to_64_256()
+ ret void
+}
+
+; 64,128 -> 64,128
+define internal void @merge_cycle_0() #1 {
+; CHECK-LABEL: define {{[^@]+}}@merge_cycle_0
+; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-NEXT: call void @merge_cycle_1()
+; CHECK-NEXT: ret void
+;
+ call void @merge_cycle_1()
+ ret void
+}
+
+; 128,256 -> 128,128
+define internal void @merge_cycle_1() #3 {
+; CHECK-LABEL: define {{[^@]+}}@merge_cycle_1
+; CHECK-SAME: () #[[ATTR4]] {
+; CHECK-NEXT: call void @merge_cycle_0()
+; CHECK-NEXT: ret void
+;
+ call void @merge_cycle_0()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_64_256() #7 {
+; CHECK-LABEL: define {{[^@]+}}@kernel_64_256
+; CHECK-SAME: () #[[ATTR6]] {
+; CHECK-NEXT: call void @merge_cycle_0()
+; CHECK-NEXT: call void @default_captured_address()
+; CHECK-NEXT: call void @externally_visible_default()
+; CHECK-NEXT: [[F32:%.*]] = call float bitcast (i32 ()* @bitcasted_function to float ()*)()
+; CHECK-NEXT: ret void
+;
+ call void @merge_cycle_0()
+ call void @default_captured_address()
+ call void @externally_visible_default()
+ %f32 = call float bitcast (i32 ()* @bitcasted_function to float ()*)()
+ ret void
+}
+
+define internal void @default_captured_address() {
+; CHECK-LABEL: define {{[^@]+}}@default_captured_address
+; CHECK-SAME: () #[[ATTR8:[0-9]+]] {
+; CHECK-NEXT: store volatile void ()* @default_captured_address, void ()** undef, align 8
+; CHECK-NEXT: ret void
+;
+ store volatile void ()* @default_captured_address, void ()** undef, align 8
+ ret void
+}
+
+define void @externally_visible_default() {
+; CHECK-LABEL: define {{[^@]+}}@externally_visible_default
+; CHECK-SAME: () #[[ATTR8]] {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+
+; 1,1024 -> 64,256
+define internal i32 @bitcasted_function() {
+; CHECK-LABEL: define {{[^@]+}}@bitcasted_function
+; CHECK-SAME: () #[[ATTR6]] {
+; CHECK-NEXT: ret i32 0
+;
+ ret i32 0
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+attributes #1 = { "amdgpu-flat-work-group-size"="64,128" }
+attributes #2 = { "amdgpu-flat-work-group-size"="64,64" }
+attributes #3 = { "amdgpu-flat-work-group-size"="128,256" }
+attributes #4 = { "amdgpu-flat-work-group-size"="512,1024" }
+attributes #5 = { "amdgpu-flat-work-group-size"="128,512" }
+attributes #6 = { "amdgpu-flat-work-group-size"="512,512" }
+attributes #7 = { "amdgpu-flat-work-group-size"="64,256" }
+;.
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="128,128" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+;.