[mlir][ROCDL] Translate known block size attributes to ROCDL

author Krzysztof Drewniak <Krzysztof.Drewniak@amd.com>

Mon, 5 Dec 2022 22:17:34 +0000 (22:17 +0000)

committer Krzysztof Drewniak <Krzysztof.Drewniak@amd.com>

Mon, 2 Jan 2023 21:04:13 +0000 (21:04 +0000)
author Krzysztof Drewniak <Krzysztof.Drewniak@amd.com>
Mon, 5 Dec 2022 22:17:34 +0000 (22:17 +0000)
committer Krzysztof Drewniak <Krzysztof.Drewniak@amd.com>
Mon, 2 Jan 2023 21:04:13 +0000 (21:04 +0000)
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td

index fa6b517..a39c41d 100644 (file)
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -30,6 +30,12 @@ def ROCDL_Dialect : Dialect {
      /// Get the name of the attribute used to annotate external kernel
      /// functions.
      static StringRef getKernelFuncAttrName() { return "rocdl.kernel"; }
+    static constexpr ::llvm::StringLiteral getFlatWorkGroupSizeAttrName() {
+      return ::llvm::StringLiteral("rocdl.flat_work_group_size");
+    }
+    static constexpr ::llvm::StringLiteral getReqdWorkGroupSizeAttrName() {
+      return ::llvm::StringLiteral("rocdl.reqd_work_group_size");
+    }
    }];
  }
  
@@ -49,8 +55,9 @@ class ROCDL_SpecialRegisterOp<string mnemonic,
      list<Trait> traits = []> :
    ROCDL_Op<mnemonic, !listconcat(traits, [Pure])>,
    Results<(outs LLVM_Type:$res)>, Arguments<(ins)> {
-  string llvmBuilder = "$res = createIntrinsicCall(builder,"
-    # "llvm::Intrinsic::amdgcn_" # !subst(".","_", mnemonic) # ");";
+  string llvmBuilder = "$res = createIntrinsicCallWithRange(builder,"
+    # "llvm::Intrinsic::amdgcn_" # !subst(".","_", mnemonic)
+    # ", op->getAttrOfType<::mlir::DenseI32ArrayAttr>(\"range\"));";
    let assemblyFormat = "attr-dict `:` type($res)";
  }
  
diff --git a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h

index f26b63e..d067c70 100644 (file)
--- a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
@@ -11,6 +11,7 @@
  #include "mlir/Conversion/LLVMCommon/Pattern.h"
  #include "mlir/Dialect/GPU/IR/GPUDialect.h"
  #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/IR/BuiltinAttributes.h"
  
  namespace mlir {
  
@@ -23,11 +24,19 @@ template <typename Op, typename XOp, typename YOp, typename ZOp>
  struct GPUIndexIntrinsicOpLowering : public ConvertOpToLLVMPattern<Op> {
  private:
    unsigned indexBitwidth;
+  StringRef boundsAttrName;
  
  public:
    explicit GPUIndexIntrinsicOpLowering(LLVMTypeConverter &typeConverter)
        : ConvertOpToLLVMPattern<Op>(typeConverter),
-        indexBitwidth(typeConverter.getIndexTypeBitwidth()) {}
+        indexBitwidth(typeConverter.getIndexTypeBitwidth()),
+        boundsAttrName("") {}
+
+  explicit GPUIndexIntrinsicOpLowering(LLVMTypeConverter &typeConverter,
+                                       StringRef boundsAttrName)
+      : ConvertOpToLLVMPattern<Op>(typeConverter),
+        indexBitwidth(typeConverter.getIndexTypeBitwidth()),
+        boundsAttrName(boundsAttrName) {}
  
    // Convert the kernel arguments to an LLVM type, preserve the rest.
    LogicalResult
@@ -35,7 +44,7 @@ public:
                    ConversionPatternRewriter &rewriter) const override {
      auto loc = op->getLoc();
      MLIRContext *context = rewriter.getContext();
-    Value newOp;
+    Operation *newOp;
      switch (op.getDimension()) {
      case gpu::Dimension::x:
        newOp = rewriter.create<XOp>(loc, IntegerType::get(context, 32));
@@ -48,15 +57,28 @@ public:
        break;
      }
  
+    Operation *function;
+    if (auto gpuFunc = op->template getParentOfType<gpu::GPUFuncOp>())
+      function = gpuFunc;
+    if (auto llvmFunc = op->template getParentOfType<LLVM::LLVMFuncOp>())
+      function = llvmFunc;
+    if (!boundsAttrName.empty() && function) {
+      if (auto attr = function->template getAttrOfType<DenseI32ArrayAttr>(
+              boundsAttrName)) {
+        int32_t maximum = attr[static_cast<uint32_t>(op.getDimension())];
+        newOp->setAttr("range", rewriter.getDenseI32ArrayAttr({0, maximum}));
+      }
+    }
+
      if (indexBitwidth > 32) {
        newOp = rewriter.create<LLVM::SExtOp>(
-          loc, IntegerType::get(context, indexBitwidth), newOp);
+          loc, IntegerType::get(context, indexBitwidth), newOp->getResult(0));
      } else if (indexBitwidth < 32) {
        newOp = rewriter.create<LLVM::TruncOp>(
-          loc, IntegerType::get(context, indexBitwidth), newOp);
+          loc, IntegerType::get(context, indexBitwidth), newOp->getResult(0));
      }
  
-    rewriter.replaceOp(op, {newOp});
+    rewriter.replaceOp(op, newOp->getResults());
      return success();
    }
  };
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp

index 1f81590..1f34fe6 100644 (file)
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -26,9 +26,11 @@
  #include "mlir/Dialect/Func/IR/FuncOps.h"
  #include "mlir/Dialect/GPU/IR/GPUDialect.h"
  #include "mlir/Dialect/GPU/Transforms/Passes.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
  #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
  #include "mlir/Dialect/Math/IR/Math.h"
  #include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/BuiltinAttributes.h"
  #include "mlir/Pass/Pass.h"
  #include "mlir/Transforms/DialectConversion.h"
  #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -139,6 +141,27 @@ struct LowerGpuOpsToROCDLOpsPass
      configureGpuToROCDLConversionLegality(target);
      if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
        signalPassFailure();
+
+    // Manually rewrite known block size attributes so the LLVMIR translation
+    // infrastructure can pick them up.
+    m.walk([ctx](LLVM::LLVMFuncOp op) {
+      if (auto blockSizes =
+              op->removeAttr(gpu::GPUFuncOp::getKnownBlockSizeAttrName())
+                  .dyn_cast_or_null<DenseI32ArrayAttr>()) {
+        op->setAttr(ROCDL::ROCDLDialect::getReqdWorkGroupSizeAttrName(),
+                    blockSizes);
+        // Also set up the rocdl.flat_work_group_size attribute to prevent
+        // conflicting metadata.
+        uint32_t flatSize = 1;
+        for (uint32_t size : blockSizes.asArrayRef()) {
+          flatSize *= size;
+        }
+        StringAttr flatSizeAttr =
+            StringAttr::get(ctx, Twine(flatSize) + "," + Twine(flatSize));
+        op->setAttr(ROCDL::ROCDLDialect::getFlatWorkGroupSizeAttrName(),
+                    flatSizeAttr);
+      }
+    });
    }
  };
  
@@ -173,11 +196,14 @@ void mlir::populateGpuToROCDLConversionPatterns(
    populateWithGenerated(patterns);
    patterns
        .add<GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
-                                       ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>,
-           GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
+                                       ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
+          converter, gpu::GPUFuncOp::getKnownBlockSizeAttrName());
+  patterns.add<GPUIndexIntrinsicOpLowering<
+      gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
+      converter, gpu::GPUFuncOp::getKnownGridSizeAttrName());
+  patterns
+      .add<GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
                                         ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>,
-           GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, ROCDL::BlockIdXOp,
-                                       ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>,
             GPUIndexIntrinsicOpLowering<gpu::GridDimOp, ROCDL::GridDimXOp,
                                         ROCDL::GridDimYOp, ROCDL::GridDimZOp>,
             GPUReturnOpLowering>(converter);
diff --git a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp

index 71d0a61..826bac9 100644 (file)
--- a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
@@ -13,17 +13,35 @@
  
  #include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
  #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/IR/BuiltinAttributes.h"
  #include "mlir/IR/Operation.h"
  #include "mlir/Target/LLVMIR/ModuleTranslation.h"
  
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/MDBuilder.h"
  #include "llvm/Support/raw_ostream.h"
  
  using namespace mlir;
  using namespace mlir::LLVM;
  using mlir::LLVM::detail::createIntrinsicCall;
  
+static llvm::Value *createIntrinsicCallWithRange(llvm::IRBuilderBase &builder,
+                                                 llvm::Intrinsic::ID intrinsic,
+                                                 DenseI32ArrayAttr maybeRange) {
+  auto *inst = llvm::cast<llvm::CallInst>(
+      createIntrinsicCall(builder, intrinsic, {}, {}));
+  if (maybeRange) {
+    SmallVector<llvm::APInt, 2> apInts;
+    for (int32_t i : maybeRange.asArrayRef())
+      apInts.push_back(llvm::APInt(32, i));
+    llvm::MDBuilder mdBuilder(builder.getContext());
+    llvm::MDNode *range = mdBuilder.createRange(apInts[0], apInts[1]);
+    inst->setMetadata(llvm::LLVMContext::MD_range, range);
+  }
+  return inst;
+}
+
  // Create a call to ROCm-Device-Library function
  // Currently this routine will work only for calling ROCDL functions that
  // take a single int32 argument. It is likely that the interface of this
@@ -80,11 +98,13 @@ public:
            moduleTranslation.lookupFunction(func.getName());
        llvmFunc->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
        if (!llvmFunc->hasFnAttribute("amdgpu-flat-work-group-size")) {
-        llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1, 256");
+        llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1,256");
        }
        llvmFunc->addFnAttr("amdgpu-implicitarg-num-bytes", "56");
      }
      // Override flat-work-group-size
+    // TODO: update clients to rocdl.flat_work_group_size instead,
+    // then remove this half of the branch
      if ("rocdl.max_flat_work_group_size" == attribute.getName()) {
        auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
        if (!func)
@@ -97,9 +117,46 @@ public:
            moduleTranslation.lookupFunction(func.getName());
        llvm::SmallString<8> llvmAttrValue;
        llvm::raw_svector_ostream attrValueStream(llvmAttrValue);
-      attrValueStream << "1, " << value.getInt();
+      attrValueStream << "1," << value.getInt();
+      llvmFunc->addFnAttr("amdgpu-flat-work-group-size", llvmAttrValue);
+    }
+    if (ROCDL::ROCDLDialect::getFlatWorkGroupSizeAttrName() ==
+        attribute.getName()) {
+      auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
+      if (!func)
+        return failure();
+      auto value = attribute.getValue().dyn_cast<StringAttr>();
+      if (!value)
+        return failure();
+
+      llvm::Function *llvmFunc =
+          moduleTranslation.lookupFunction(func.getName());
+      llvm::SmallString<8> llvmAttrValue;
+      llvmAttrValue.append(value.getValue());
        llvmFunc->addFnAttr("amdgpu-flat-work-group-size", llvmAttrValue);
      }
+
+    // Set reqd_work_group_size metadata
+    if (ROCDL::ROCDLDialect::getReqdWorkGroupSizeAttrName() ==
+        attribute.getName()) {
+      auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
+      if (!func)
+        return failure();
+      auto value = attribute.getValue().dyn_cast<DenseI32ArrayAttr>();
+      if (!value)
+        return failure();
+      llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext();
+      SmallVector<llvm::Metadata *, 3> metadata;
+      llvm::Type *i32 = llvm::IntegerType::get(llvmContext, 32);
+      for (int32_t i : value.asArrayRef()) {
+        llvm::Constant *constant = llvm::ConstantInt::get(i32, i);
+        metadata.push_back(llvm::ConstantAsMetadata::get(constant));
+      }
+      llvm::Function *llvmFunc =
+          moduleTranslation.lookupFunction(func.getName());
+      llvm::MDNode *node = llvm::MDNode::get(llvmContext, metadata);
+      llvmFunc->setMetadata("reqd_work_group_size", node);
+    }
      return success();
    }
  };
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir

index 918fdcc..5751d76 100644 (file)
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -59,6 +59,36 @@ gpu.module @test_module {
  // -----
  
  gpu.module @test_module {
+  // CHECK-LABEL: func @gpu_index_ops_range()
+  // CHECK-SAME: rocdl.flat_work_group_size = "1536,1536"
+  // CHECK-SAME: rocdl.reqd_work_group_size = array<i32: 8, 12, 16>
+  func.func @gpu_index_ops_range()
+      -> (index, index, index, index, index, index) attributes
+      {gpu.known_block_size = array<i32: 8, 12, 16>,
+       gpu.known_grid_size = array<i32: 20, 24, 28>} {
+
+    // CHECK: rocdl.workitem.id.x {range = array<i32: 0, 8>} : i32
+    %tIdX = gpu.thread_id x
+    // CHECK: rocdl.workitem.id.y {range = array<i32: 0, 12>} : i32
+    %tIdY = gpu.thread_id y
+    // CHECK: rocdl.workitem.id.z {range = array<i32: 0, 16>} : i32
+    %tIdZ = gpu.thread_id z
+
+    // CHECK: rocdl.workgroup.id.x {range = array<i32: 0, 20>} : i32
+    %bIdX = gpu.block_id x
+    // CHECK: rocdl.workgroup.id.y {range = array<i32: 0, 24>} : i32
+    %bIdY = gpu.block_id y
+    // CHECK: rocdl.workgroup.id.z {range = array<i32: 0, 28>} : i32
+    %bIdZ = gpu.block_id z
+
+    func.return %tIdX, %tIdY, %tIdZ, %bIdX, %bIdY, %bIdZ
+        : index, index, index, index, index, index
+  }
+}
+
+// -----
+
+gpu.module @test_module {
    // CHECK-LABEL: func @gpu_index_comp
    // CHECK32-LABEL: func @gpu_index_comp
    func.func @gpu_index_comp(%idx : index) -> index {
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir

index 8a67320..a6ca45c 100644 (file)
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -26,6 +26,10 @@ llvm.func @rocdl_special_regs() -> i32 {
    %11 = rocdl.grid.dim.y : i64
    // CHECK: call i64 @__ockl_get_global_size(i32 2)
    %12 = rocdl.grid.dim.z : i64
+
+  // CHECK: call i32 @llvm.amdgcn.workitem.id.x(),{{.*}} !range ![[$RANGE:[0-9]+]]
+  %13 = rocdl.workitem.id.x {range = array<i32: 0, 64>} : i32
+
    llvm.return %1 : i32
  }
  
@@ -42,6 +46,16 @@ llvm.func @kernel_func_workgroups()
    llvm.return
  }
  
+llvm.func @known_block_sizes()
+    attributes {rocdl.kernel,
+      rocdl.flat_work_group_size = "128,128",
+      rocdl.reqd_work_group_size = array<i32: 16, 4, 2>} {
+  // CHECK-LABEL: amdgpu_kernel void @known_block_sizes()
+  // CHECK: #[[$KNOWN_BLOCK_SIZE_ATTRS:[0-9]+]]
+  // CHECK: !reqd_work_group_size ![[$REQD_WORK_GROUP_SIZE:[0-9]+]]
+  llvm.return
+}
+
  llvm.func @rocdl.barrier() {
    // CHECK:      fence syncscope("workgroup") release
    // CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
@@ -225,5 +239,8 @@ llvm.func @rocdl.raw.buffer.atomic(%rsrc : vector<4xi32>,
    llvm.return
  }
  
-// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1, 256" "amdgpu-implicitarg-num-bytes"="56" }
-// CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1, 1024"
+// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-implicitarg-num-bytes"="56" }
+// CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024"
+// CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"
+// CHECK-DAG: ![[$RANGE]] = !{i32 0, i32 64}
+// CHECK-DAG: ![[$REQD_WORK_GROUP_SIZE]] = !{i32 16, i32 4, i32 2}
author	Krzysztof Drewniak <Krzysztof.Drewniak@amd.com>
	Mon, 5 Dec 2022 22:17:34 +0000 (22:17 +0000)
committer	Krzysztof Drewniak <Krzysztof.Drewniak@amd.com>
	Mon, 2 Jan 2023 21:04:13 +0000 (21:04 +0000)
mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td		patch \| blob \| history
mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h		patch \| blob \| history
mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp		patch \| blob \| history
mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp		patch \| blob \| history
mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir		patch \| blob \| history
mlir/test/Target/LLVMIR/rocdl.mlir		patch \| blob \| history