/// Get the name of the attribute used to annotate external kernel
/// functions.
static StringRef getKernelFuncAttrName() { return "rocdl.kernel"; }
+ /// Get the name of the function attribute whose string value is forwarded
+ /// to the `amdgpu-flat-work-group-size` LLVM function attribute during
+ /// translation to LLVM IR.
+ static constexpr ::llvm::StringLiteral getFlatWorkGroupSizeAttrName() {
+ return ::llvm::StringLiteral("rocdl.flat_work_group_size");
+ }
+ /// Get the name of the function attribute whose i32 array value is emitted
+ /// as `reqd_work_group_size` metadata on the translated LLVM function.
+ static constexpr ::llvm::StringLiteral getReqdWorkGroupSizeAttrName() {
+ return ::llvm::StringLiteral("rocdl.reqd_work_group_size");
+ }
}];
}
list<Trait> traits = []> :
ROCDL_Op<mnemonic, !listconcat(traits, [Pure])>,
Results<(outs LLVM_Type:$res)>, Arguments<(ins)> {
- string llvmBuilder = "$res = createIntrinsicCall(builder,"
- # "llvm::Intrinsic::amdgcn_" # !subst(".","_", mnemonic) # ");";
+ string llvmBuilder = "$res = createIntrinsicCallWithRange(builder,"
+ # "llvm::Intrinsic::amdgcn_" # !subst(".","_", mnemonic)
+ # ", op->getAttrOfType<::mlir::DenseI32ArrayAttr>(\"range\"));";
let assemblyFormat = "attr-dict `:` type($res)";
}
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/IR/BuiltinAttributes.h"
namespace mlir {
struct GPUIndexIntrinsicOpLowering : public ConvertOpToLLVMPattern<Op> {
private:
unsigned indexBitwidth;
+ StringRef boundsAttrName;
public:
explicit GPUIndexIntrinsicOpLowering(LLVMTypeConverter &typeConverter)
: ConvertOpToLLVMPattern<Op>(typeConverter),
- indexBitwidth(typeConverter.getIndexTypeBitwidth()) {}
+ indexBitwidth(typeConverter.getIndexTypeBitwidth()),
+ boundsAttrName("") {}
+
+ explicit GPUIndexIntrinsicOpLowering(LLVMTypeConverter &typeConverter,
+ StringRef boundsAttrName)
+ : ConvertOpToLLVMPattern<Op>(typeConverter),
+ indexBitwidth(typeConverter.getIndexTypeBitwidth()),
+ boundsAttrName(boundsAttrName) {}
// Convert the kernel arguments to an LLVM type, preserve the rest.
LogicalResult
ConversionPatternRewriter &rewriter) const override {
auto loc = op->getLoc();
MLIRContext *context = rewriter.getContext();
- Value newOp;
+ Operation *newOp;
switch (op.getDimension()) {
case gpu::Dimension::x:
newOp = rewriter.create<XOp>(loc, IntegerType::get(context, 32));
break;
}
+    // Locate the enclosing function that may carry the bounds attribute.
+    // The pointer must start out null: when the op is nested in neither a
+    // gpu.func nor an llvm.func, the check below would otherwise read an
+    // uninitialized value.
+    Operation *function = nullptr;
+    if (auto gpuFunc = op->template getParentOfType<gpu::GPUFuncOp>())
+      function = gpuFunc;
+    if (auto llvmFunc = op->template getParentOfType<LLVM::LLVMFuncOp>())
+      function = llvmFunc;
+    if (!boundsAttrName.empty() && function) {
+      if (auto attr = function->template getAttrOfType<DenseI32ArrayAttr>(
+              boundsAttrName)) {
+        // Only annotate when the attribute has an entry for the requested
+        // dimension; indexing a shorter array would be out of bounds.
+        ArrayRef<int32_t> bounds = attr.asArrayRef();
+        auto dim = static_cast<uint32_t>(op.getDimension());
+        if (dim < bounds.size()) {
+          int32_t maximum = bounds[dim];
+          newOp->setAttr("range",
+                         rewriter.getDenseI32ArrayAttr({0, maximum}));
+        }
+      }
+    }
+
if (indexBitwidth > 32) {
newOp = rewriter.create<LLVM::SExtOp>(
- loc, IntegerType::get(context, indexBitwidth), newOp);
+ loc, IntegerType::get(context, indexBitwidth), newOp->getResult(0));
} else if (indexBitwidth < 32) {
newOp = rewriter.create<LLVM::TruncOp>(
- loc, IntegerType::get(context, indexBitwidth), newOp);
+ loc, IntegerType::get(context, indexBitwidth), newOp->getResult(0));
}
- rewriter.replaceOp(op, {newOp});
+ rewriter.replaceOp(op, newOp->getResults());
return success();
}
};
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
configureGpuToROCDLConversionLegality(target);
if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
signalPassFailure();
+
+    // Translate the GPU dialect's known-block-size annotation into ROCDL
+    // attributes by hand so that the LLVMIR translation infrastructure can
+    // pick them up.
+    m.walk([ctx](LLVM::LLVMFuncOp funcOp) {
+      auto knownBlockSize =
+          funcOp->removeAttr(gpu::GPUFuncOp::getKnownBlockSizeAttrName())
+              .dyn_cast_or_null<DenseI32ArrayAttr>();
+      if (!knownBlockSize)
+        return;
+
+      funcOp->setAttr(ROCDL::ROCDLDialect::getReqdWorkGroupSizeAttrName(),
+                      knownBlockSize);
+
+      // The flat work group size is the product of the per-dimension sizes.
+      // Emit it as well (as "N,N") to prevent conflicting metadata.
+      uint32_t totalThreads = 1;
+      for (uint32_t dimSize : knownBlockSize.asArrayRef())
+        totalThreads *= dimSize;
+      StringAttr flatRange = StringAttr::get(
+          ctx, Twine(totalThreads) + "," + Twine(totalThreads));
+      funcOp->setAttr(ROCDL::ROCDLDialect::getFlatWorkGroupSizeAttrName(),
+                      flatRange);
+    });
}
};
populateWithGenerated(patterns);
patterns
.add<GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
- ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>,
- GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
+ ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
+ converter, gpu::GPUFuncOp::getKnownBlockSizeAttrName());
+ patterns.add<GPUIndexIntrinsicOpLowering<
+ gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
+ converter, gpu::GPUFuncOp::getKnownGridSizeAttrName());
+ patterns
+ .add<GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>,
- GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, ROCDL::BlockIdXOp,
- ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>,
GPUIndexIntrinsicOpLowering<gpu::GridDimOp, ROCDL::GridDimXOp,
ROCDL::GridDimYOp, ROCDL::GridDimZOp>,
GPUReturnOpLowering>(converter);
#include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/Operation.h"
#include "mlir/Target/LLVMIR/ModuleTranslation.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/raw_ostream.h"
using namespace mlir;
using namespace mlir::LLVM;
using mlir::LLVM::detail::createIntrinsicCall;
+/// Create a call to the zero-argument intrinsic `intrinsic` and, when
+/// `maybeRange` is present, attach its first two values to the call as
+/// `!range` metadata.
+///
+/// `maybeRange` may be null (no metadata is attached). An array with fewer
+/// than two elements is treated the same way instead of being indexed out
+/// of bounds.
+static llvm::Value *createIntrinsicCallWithRange(llvm::IRBuilderBase &builder,
+                                                 llvm::Intrinsic::ID intrinsic,
+                                                 DenseI32ArrayAttr maybeRange) {
+  auto *inst = llvm::cast<llvm::CallInst>(
+      createIntrinsicCall(builder, intrinsic, {}, {}));
+  if (!maybeRange)
+    return inst;
+
+  ArrayRef<int32_t> bounds = maybeRange.asArrayRef();
+  if (bounds.size() < 2)
+    return inst;
+
+  // The bounds are signed 32-bit values; build the APInts as such so the
+  // conversion to the 64-bit constructor argument is well-defined.
+  llvm::APInt lower(32, bounds[0], /*isSigned=*/true);
+  llvm::APInt upper(32, bounds[1], /*isSigned=*/true);
+  llvm::MDBuilder mdBuilder(builder.getContext());
+  llvm::MDNode *range = mdBuilder.createRange(lower, upper);
+  inst->setMetadata(llvm::LLVMContext::MD_range, range);
+  return inst;
+}
+
// Create a call to ROCm-Device-Library function
// Currently this routine will work only for calling ROCDL functions that
// take a single int32 argument. It is likely that the interface of this
moduleTranslation.lookupFunction(func.getName());
llvmFunc->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
if (!llvmFunc->hasFnAttribute("amdgpu-flat-work-group-size")) {
- llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1, 256");
+ llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1,256");
}
llvmFunc->addFnAttr("amdgpu-implicitarg-num-bytes", "56");
}
// Override flat-work-group-size
+ // TODO: update clients to rocdl.flat_work_group_size instead,
+ // then remove this half of the branch
if ("rocdl.max_flat_work_group_size" == attribute.getName()) {
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
if (!func)
moduleTranslation.lookupFunction(func.getName());
llvm::SmallString<8> llvmAttrValue;
llvm::raw_svector_ostream attrValueStream(llvmAttrValue);
- attrValueStream << "1, " << value.getInt();
+ attrValueStream << "1," << value.getInt();
+ llvmFunc->addFnAttr("amdgpu-flat-work-group-size", llvmAttrValue);
+ }
+ if (ROCDL::ROCDLDialect::getFlatWorkGroupSizeAttrName() ==
+ attribute.getName()) {
+ auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
+ if (!func)
+ return failure();
+ auto value = attribute.getValue().dyn_cast<StringAttr>();
+ if (!value)
+ return failure();
+
+ llvm::Function *llvmFunc =
+ moduleTranslation.lookupFunction(func.getName());
+ llvm::SmallString<8> llvmAttrValue;
+ llvmAttrValue.append(value.getValue());
llvmFunc->addFnAttr("amdgpu-flat-work-group-size", llvmAttrValue);
}
+
+ // Set reqd_work_group_size metadata
+ if (ROCDL::ROCDLDialect::getReqdWorkGroupSizeAttrName() ==
+ attribute.getName()) {
+ auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
+ if (!func)
+ return failure();
+ auto value = attribute.getValue().dyn_cast<DenseI32ArrayAttr>();
+ if (!value)
+ return failure();
+ llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext();
+ SmallVector<llvm::Metadata *, 3> metadata;
+ llvm::Type *i32 = llvm::IntegerType::get(llvmContext, 32);
+ for (int32_t i : value.asArrayRef()) {
+ llvm::Constant *constant = llvm::ConstantInt::get(i32, i);
+ metadata.push_back(llvm::ConstantAsMetadata::get(constant));
+ }
+ llvm::Function *llvmFunc =
+ moduleTranslation.lookupFunction(func.getName());
+ llvm::MDNode *node = llvm::MDNode::get(llvmContext, metadata);
+ llvmFunc->setMetadata("reqd_work_group_size", node);
+ }
return success();
}
};
// -----
gpu.module @test_module {
+ // CHECK-LABEL: func @gpu_index_ops_range()
+ // CHECK-SAME: rocdl.flat_work_group_size = "1536,1536"
+ // CHECK-SAME: rocdl.reqd_work_group_size = array<i32: 8, 12, 16>
+ func.func @gpu_index_ops_range()
+ -> (index, index, index, index, index, index) attributes
+ {gpu.known_block_size = array<i32: 8, 12, 16>,
+ gpu.known_grid_size = array<i32: 20, 24, 28>} {
+
+ // CHECK: rocdl.workitem.id.x {range = array<i32: 0, 8>} : i32
+ %tIdX = gpu.thread_id x
+ // CHECK: rocdl.workitem.id.y {range = array<i32: 0, 12>} : i32
+ %tIdY = gpu.thread_id y
+ // CHECK: rocdl.workitem.id.z {range = array<i32: 0, 16>} : i32
+ %tIdZ = gpu.thread_id z
+
+ // CHECK: rocdl.workgroup.id.x {range = array<i32: 0, 20>} : i32
+ %bIdX = gpu.block_id x
+ // CHECK: rocdl.workgroup.id.y {range = array<i32: 0, 24>} : i32
+ %bIdY = gpu.block_id y
+ // CHECK: rocdl.workgroup.id.z {range = array<i32: 0, 28>} : i32
+ %bIdZ = gpu.block_id z
+
+ func.return %tIdX, %tIdY, %tIdZ, %bIdX, %bIdY, %bIdZ
+ : index, index, index, index, index, index
+ }
+}
+
+// -----
+
+gpu.module @test_module {
// CHECK-LABEL: func @gpu_index_comp
// CHECK32-LABEL: func @gpu_index_comp
func.func @gpu_index_comp(%idx : index) -> index {
%11 = rocdl.grid.dim.y : i64
// CHECK: call i64 @__ockl_get_global_size(i32 2)
%12 = rocdl.grid.dim.z : i64
+
+ // CHECK: call i32 @llvm.amdgcn.workitem.id.x(),{{.*}} !range ![[$RANGE:[0-9]+]]
+ %13 = rocdl.workitem.id.x {range = array<i32: 0, 64>} : i32
+
llvm.return %1 : i32
}
llvm.return
}
+llvm.func @known_block_sizes()
+ attributes {rocdl.kernel,
+ rocdl.flat_work_group_size = "128,128",
+ rocdl.reqd_work_group_size = array<i32: 16, 4, 2>} {
+ // CHECK-LABEL: amdgpu_kernel void @known_block_sizes()
+ // CHECK: #[[$KNOWN_BLOCK_SIZE_ATTRS:[0-9]+]]
+ // CHECK: !reqd_work_group_size ![[$REQD_WORK_GROUP_SIZE:[0-9]+]]
+ llvm.return
+}
+
llvm.func @rocdl.barrier() {
// CHECK: fence syncscope("workgroup") release
// CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
llvm.return
}
-// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1, 256" "amdgpu-implicitarg-num-bytes"="56" }
-// CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1, 1024"
+// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-implicitarg-num-bytes"="56" }
+// CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024"
+// CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"
+// CHECK-DAG: ![[$RANGE]] = !{i32 0, i32 64}
+// CHECK-DAG: ![[$REQD_WORK_GROUP_SIZE]] = !{i32 16, i32 4, i32 2}