[mlir][GPUDialect] Implement memory attributions for LaunchOp

author Fabian Mora <fmorac@udel.edu>

Wed, 26 Apr 2023 22:52:37 +0000 (17:52 -0500)

committer max <maksim.levental@gmail.com>

Wed, 26 Apr 2023 22:53:18 +0000 (17:53 -0500)
author Fabian Mora <fmorac@udel.edu>
Wed, 26 Apr 2023 22:52:37 +0000 (17:52 -0500)
committer max <maksim.levental@gmail.com>
Wed, 26 Apr 2023 22:53:18 +0000 (17:53 -0500)
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td

index 860e207..e67adbc 100644 (file)
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -536,12 +536,14 @@ def GPU_LaunchOp : GPU_Op<"launch", [
      to the amount of dynamic shared memory a kernel's workgroup should be
      allocated; when this operand is not present, a zero size is assumed.
  
-    The body region has _twelve_ arguments, grouped as follows:
+    The body region has at least _twelve_ arguments, grouped as follows:
  
      -   three arguments that contain block identifiers along x,y,z dimensions;
      -   three arguments that contain thread identifiers along x,y,z dimensions;
      -   operands of the `gpu.launch` operation as is (i.e. the operands for
          grid and block sizes).
+    -   a variadic number of Workgroup memory attributions.
+    -   a variadic number of Private memory attributions.
  
      Syntax:
  
@@ -550,8 +552,11 @@ def GPU_LaunchOp : GPU_Op<"launch", [
                               `block` `(` ssa-id-list `)` `in` ssa-reassignment
                               `threads` `(` ssa-id-list `)` `in` ssa-reassignment
                               (dynamic_shared_memory_size ssa-use)?
+                             memory-attribution
                               region attr-dict?
      ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)`
+    memory-attribution ::= (`workgroup` `(` ssa-id-and-type-list `)`)?
+                           (`private` `(` ssa-id-and-type-list `)`)?
      ```
  
      Example:
@@ -582,6 +587,18 @@ def GPU_LaunchOp : GPU_Op<"launch", [
        "some_op"(%bx, %tx) : (index, index) -> ()
        %3 = "memref.load"(%val1, %bx) : (memref<?xf32, 1>, index) -> f32
      }
+
+    // Launch with memory attributions.
+    gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %0, %sz_by = %1, %sz_bz = %2)
+               threads(%tx, %ty, %tz) in (%sz_tx = %3, %sz_ty = %4, %sz_tz = %5)
+               workgroup(%workgroup: memref<32xf32, 3>)
+               private(%private: memref<1xf32, 5>) {
+      // Block and thread identifiers, as well as block/grid sizes are
+      // immediately usable inside body region.
+      "some_op"(%bx, %tx) : (index, index) -> ()
+      // %workgroup is a workgroup memory attribution of this gpu.launch.
+      %42 = memref.load %workgroup[%bx] : memref<32xf32, 3>
+    }
      ```
  
      Rationale: using operation/block arguments gives analyses a clear way of
@@ -601,7 +618,9 @@ def GPU_LaunchOp : GPU_Op<"launch", [
        "Value":$blockSizeZ,
        CArg<"Value", "nullptr">:$dynamicSharedMemorySize,
        CArg<"Type", "nullptr">:$asyncTokenType,
-      CArg<"ValueRange", "{}">:$asyncDependencies)>
+      CArg<"ValueRange", "{}">:$asyncDependencies,
+      CArg<"TypeRange", "{}">:$workgroupAttributions,
+      CArg<"TypeRange", "{}">:$privateAttributions)>
    ];
  
    let extraClassDeclaration = [{
@@ -632,6 +651,57 @@ def GPU_LaunchOp : GPU_Op<"launch", [
      /// The number of region attributes containing the launch configuration,
      /// placed in the leading positions of the argument list.
      static constexpr unsigned kNumConfigRegionAttributes = 12;
+
+    /// Returns the keywords used in the custom syntax for this Op.
+    static StringRef getWorkgroupKeyword() { return "workgroup"; }
+    static StringRef getPrivateKeyword() { return "private"; }
+
+    /// Returns the number of buffers located in the workgroup memory.
+    unsigned getNumWorkgroupAttributions() {
+      auto attr = (*this)->getAttrOfType<IntegerAttr>(
+          getNumWorkgroupAttributionsAttrName());
+      return attr ? attr.getInt() : 0;
+    }
+
+    /// Returns a list of block arguments that correspond to buffers located in
+    /// the workgroup memory
+    ArrayRef<BlockArgument> getWorkgroupAttributions() {
+      auto begin =
+          std::next(getBody().args_begin(), kNumConfigRegionAttributes);
+      auto end = std::next(begin, getNumWorkgroupAttributions());
+      return {begin, end};
+    }
+
+    /// Adds a new block argument that corresponds to buffers located in
+    /// workgroup memory.
+    BlockArgument addWorkgroupAttribution(Type type, Location loc);
+
+    /// Returns the number of buffers located in the private memory.
+    unsigned getNumPrivateAttributions() {
+      return getBody().getNumArguments() - kNumConfigRegionAttributes -
+          getNumWorkgroupAttributions();
+    }
+
+    /// Returns a list of block arguments that correspond to buffers located in
+    /// the private memory.
+    ArrayRef<BlockArgument> getPrivateAttributions() {
+      // Buffers on the private memory always come after buffers on the workgroup
+      // memory.
+      auto begin =
+          std::next(getBody().args_begin(),
+                    kNumConfigRegionAttributes + getNumWorkgroupAttributions());
+      return {begin, getBody().args_end()};
+    }
+
+    /// Adds a new block argument that corresponds to buffers located in
+    /// private memory.
+    BlockArgument addPrivateAttribution(Type type, Location loc);
+
+    /// Returns the name of the attribute containing the number of buffers
+    /// located in the workgroup memory.
+    static StringRef getNumWorkgroupAttributionsAttrName() {
+      return "workgroup_attributions";
+    }
    }];
  
    let hasCanonicalizer = 1;
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

index f9d929d..3ce6083 100644 (file)
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -332,6 +332,60 @@ static void printAsyncDependencies(OpAsmPrinter &printer, Operation *op,
    printer << ']';
  }
  
+// GPU memory attribution functions shared by LaunchOp and GPUFuncOp.
+/// Parses a GPU function memory attribution.
+///
+/// memory-attribution ::= (`workgroup` `(` ssa-id-and-type-list `)`)?
+///                        (`private` `(` ssa-id-and-type-list `)`)?
+///
+/// Note that this function parses only one of the two similar parts, with the
+/// keyword provided as argument.
+static ParseResult
+parseAttributions(OpAsmParser &parser, StringRef keyword,
+                  SmallVectorImpl<OpAsmParser::Argument> &args) {
+  // If we could not parse the keyword, just assume empty list and succeed.
+  if (failed(parser.parseOptionalKeyword(keyword)))
+    return success();
+
+  return parser.parseArgumentList(args, OpAsmParser::Delimiter::Paren,
+                                  /*allowType=*/true);
+}
+
+/// Prints a GPU function memory attribution.
+static void printAttributions(OpAsmPrinter &p, StringRef keyword,
+                              ArrayRef<BlockArgument> values) {
+  if (values.empty())
+    return;
+
+  p << ' ' << keyword << '(';
+  llvm::interleaveComma(
+      values, p, [&p](BlockArgument v) { p << v << " : " << v.getType(); });
+  p << ')';
+}
+
+/// Verifies a GPU function memory attribution.
+static LogicalResult verifyAttributions(Operation *op,
+                                        ArrayRef<BlockArgument> attributions,
+                                        gpu::AddressSpace memorySpace) {
+  for (Value v : attributions) {
+    auto type = v.getType().dyn_cast<MemRefType>();
+    if (!type)
+      return op->emitOpError() << "expected memref type in attribution";
+
+    // We can only verify the address space if it hasn't already been lowered
+    // from the AddressSpaceAttr to a target-specific numeric value.
+    auto addressSpace =
+        type.getMemorySpace().dyn_cast_or_null<gpu::AddressSpaceAttr>();
+    if (!addressSpace)
+      continue;
+    if (addressSpace.getValue() != memorySpace)
+      return op->emitOpError()
+             << "expected memory space " << stringifyAddressSpace(memorySpace)
+             << " in attribution";
+  }
+  return success();
+}
+
  //===----------------------------------------------------------------------===//
  // AllReduceOp
  //===----------------------------------------------------------------------===//
@@ -439,7 +493,15 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result,
                       Value gridSizeX, Value gridSizeY, Value gridSizeZ,
                       Value getBlockSizeX, Value getBlockSizeY,
                       Value getBlockSizeZ, Value dynamicSharedMemorySize,
-                     Type asyncTokenType, ValueRange asyncDependencies) {
+                     Type asyncTokenType, ValueRange asyncDependencies,
+                     TypeRange workgroupAttributions,
+                     TypeRange privateAttributions) {
+  // Add a WorkGroup attribution attribute. This attribute is required to
+  // identify private attributions in the list of block arguments.
+  result.addAttribute(getNumWorkgroupAttributionsAttrName(),
+                      builder.getI64IntegerAttr(workgroupAttributions.size()));
+
+  // Add Op operands.
    result.addOperands(asyncDependencies);
    if (asyncTokenType)
      result.types.push_back(builder.getType<AsyncTokenType>());
@@ -450,14 +512,21 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result,
    if (dynamicSharedMemorySize)
      result.addOperands(dynamicSharedMemorySize);
  
-  // Create a kernel body region with kNumConfigRegionAttributes + N arguments,
-  // where the first kNumConfigRegionAttributes arguments have `index` type and
-  // the rest have the same types as the data operands.
+  // Create a kernel body region with kNumConfigRegionAttributes + N arguments:
+  // the first kNumConfigRegionAttributes arguments have `index` type and the
+  // remaining N are the workgroup and private memory attributions, in order.
    Region *kernelRegion = result.addRegion();
    Block *body = new Block();
+  // TODO: Allow passing in proper locations here.
    for (unsigned i = 0; i < kNumConfigRegionAttributes; ++i)
      body->addArgument(builder.getIndexType(), result.location);
+  // Add WorkGroup & Private attributions to the region arguments.
+  for (Type argTy : workgroupAttributions)
+    body->addArgument(argTy, result.location);
+  for (Type argTy : privateAttributions)
+    body->addArgument(argTy, result.location);
    kernelRegion->push_back(body);
+  // Fill OperandSegmentSize Attribute.
    SmallVector<int32_t, 8> segmentSizes(8, 1);
    segmentSizes.front() = asyncDependencies.size();
    segmentSizes.back() = dynamicSharedMemorySize ? 1 : 0;
@@ -504,13 +573,18 @@ LogicalResult LaunchOp::verifyRegions() {
    // sizes and transforms them into kNumConfigRegionAttributes region arguments
    // for block/thread identifiers and grid/block sizes.
    if (!getBody().empty()) {
-    if (getBody().getNumArguments() !=
-        LaunchOp::kNumConfigOperands + getNumOperands() -
-            (getDynamicSharedMemorySize() ? 1 : 0) -
-            getAsyncDependencies().size())
+    if (getBody().getNumArguments() <
+        kNumConfigRegionAttributes + getNumWorkgroupAttributions())
        return emitOpError("unexpected number of region arguments");
    }
  
+  // Verify Attributions Address Spaces.
+  if (failed(verifyAttributions(getOperation(), getWorkgroupAttributions(),
+                                GPUDialect::getWorkgroupAddressSpace())) ||
+      failed(verifyAttributions(getOperation(), getPrivateAttributions(),
+                                GPUDialect::getPrivateAddressSpace())))
+    return failure();
+
    // Block terminators without successors are expected to exit the kernel region
    // and must be `gpu.terminator`.
    for (Block &block : getBody()) {
@@ -563,10 +637,15 @@ void LaunchOp::print(OpAsmPrinter &p) {
      p << ' ' << getDynamicSharedMemorySizeKeyword() << ' '
        << getDynamicSharedMemorySize();
  
+  printAttributions(p, getWorkgroupKeyword(), getWorkgroupAttributions());
+  printAttributions(p, getPrivateKeyword(), getPrivateAttributions());
+
    p << ' ';
+
    p.printRegion(getBody(), /*printEntryBlockArgs=*/false);
    p.printOptionalAttrDict((*this)->getAttrs(), /*elidedAttrs=*/{
-                              LaunchOp::getOperandSegmentSizeAttr()});
+                              LaunchOp::getOperandSegmentSizeAttr(),
+                              getNumWorkgroupAttributionsAttrName()});
  }
  
  // Parse the size assignment blocks for blocks and threads.  These have the form
@@ -601,8 +680,9 @@ parseSizeAssignment(OpAsmParser &parser,
  
  /// Parses a Launch operation.
  /// operation ::= `gpu.launch` (`async` `[` ssa-id-list `]`)?
-//        `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
+///       `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
  ///       `threads` `(` ssa-id-list `)` `in` ssa-reassignment
+///       memory-attribution
  ///       region attr-dict?
  /// ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)`
  ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
@@ -659,9 +739,12 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
        return failure();
    }
  
-  // Introduce the body region and parse it. The region has
-  // kNumConfigRegionAttributes arguments that correspond to
-  // block/thread identifiers and grid/block sizes, all of the `index` type.
+  // Create the region arguments. The region has kNumConfigRegionAttributes
+  // arguments that correspond to block/thread identifiers and grid/block
+  // sizes, all having `index` type, followed by a variadic number of
+  // WorkGroup Attributions and a variadic number of Private Attributions.
+  // The number of WorkGroup Attributions is stored in the attr with name:
+  // LaunchOp::getNumWorkgroupAttributionsAttrName().
    Type index = parser.getBuilder().getIndexType();
    SmallVector<Type, LaunchOp::kNumConfigRegionAttributes> dataTypes(
        LaunchOp::kNumConfigRegionAttributes, index);
@@ -674,6 +757,27 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
      regionArguments.push_back(arg);
    }
  
+  Builder &builder = parser.getBuilder();
+  // Parse workgroup memory attributions.
+  if (failed(parseAttributions(parser, LaunchOp::getWorkgroupKeyword(),
+                               regionArguments)))
+    return failure();
+
+  // Store the number of operands we just parsed as the number of workgroup
+  // memory attributions.
+  unsigned numWorkgroupAttrs =
+      regionArguments.size() - LaunchOp::kNumConfigRegionAttributes;
+  result.addAttribute(LaunchOp::getNumWorkgroupAttributionsAttrName(),
+                      builder.getI64IntegerAttr(numWorkgroupAttrs));
+
+  // Parse private memory attributions.
+  if (failed(parseAttributions(parser, LaunchOp::getPrivateKeyword(),
+                               regionArguments)))
+    return failure();
+
+  // Introduce the body region and parse it. Its entry block arguments are the
+  // kNumConfigRegionAttributes `index` values for block/thread identifiers and
+  // grid/block sizes, followed by the memory attributions parsed above.
    Region *body = result.addRegion();
    if (parser.parseRegion(*body, regionArguments) ||
        parser.parseOptionalAttrDict(result.attributes))
@@ -729,6 +833,25 @@ void LaunchOp::getCanonicalizationPatterns(RewritePatternSet &rewrites,
    rewrites.add<FoldLaunchArguments>(context);
  }
  
+/// Adds a new block argument that corresponds to buffers located in
+/// workgroup memory. A missing count attribute is treated as zero.
+BlockArgument LaunchOp::addWorkgroupAttribution(Type type, Location loc) {
+  // Read the count through the null-safe accessor instead of the raw attr.
+  unsigned numWorkgroup = getNumWorkgroupAttributions();
+  (*this)->setAttr(getNumWorkgroupAttributionsAttrName(),
+                   Builder(getContext()).getI64IntegerAttr(numWorkgroup + 1));
+  return getBody().insertArgument(
+      LaunchOp::kNumConfigRegionAttributes + numWorkgroup, type, loc);
+}
+
+/// Adds a new block argument that corresponds to buffers located in
+/// private memory.
+BlockArgument LaunchOp::addPrivateAttribution(Type type, Location loc) {
+  // Buffers on the private memory always come after buffers on the workgroup
+  // memory.
+  return getBody().addArgument(type, loc);
+}
+
  //===----------------------------------------------------------------------===//
  // LaunchFuncOp
  //===----------------------------------------------------------------------===//
@@ -894,24 +1017,6 @@ void GPUFuncOp::build(OpBuilder &builder, OperationState &result,
    body->getBlocks().push_back(entryBlock);
  }
  
-/// Parses a GPU function memory attribution.
-///
-/// memory-attribution ::= (`workgroup` `(` ssa-id-and-type-list `)`)?
-///                        (`private` `(` ssa-id-and-type-list `)`)?
-///
-/// Note that this function parses only one of the two similar parts, with the
-/// keyword provided as argument.
-static ParseResult
-parseAttributions(OpAsmParser &parser, StringRef keyword,
-                  SmallVectorImpl<OpAsmParser::Argument> &args) {
-  // If we could not parse the keyword, just assume empty list and succeed.
-  if (failed(parser.parseOptionalKeyword(keyword)))
-    return success();
-
-  return parser.parseArgumentList(args, OpAsmParser::Delimiter::Paren,
-                                  /*allowType=*/true);
-}
-
  /// Parses a GPU function.
  ///
  /// <operation> ::= `gpu.func` symbol-ref-id `(` argument-list `)`
@@ -985,17 +1090,6 @@ ParseResult GPUFuncOp::parse(OpAsmParser &parser, OperationState &result) {
    return parser.parseRegion(*body, entryArgs);
  }
  
-static void printAttributions(OpAsmPrinter &p, StringRef keyword,
-                              ArrayRef<BlockArgument> values) {
-  if (values.empty())
-    return;
-
-  p << ' ' << keyword << '(';
-  llvm::interleaveComma(
-      values, p, [&p](BlockArgument v) { p << v << " : " << v.getType(); });
-  p << ')';
-}
-
  void GPUFuncOp::print(OpAsmPrinter &p) {
    p << ' ';
    p.printSymbolName(getName());
@@ -1026,28 +1120,6 @@ LogicalResult GPUFuncOp::verifyType() {
    return success();
  }
  
-static LogicalResult verifyAttributions(Operation *op,
-                                        ArrayRef<BlockArgument> attributions,
-                                        gpu::AddressSpace memorySpace) {
-  for (Value v : attributions) {
-    auto type = v.getType().dyn_cast<MemRefType>();
-    if (!type)
-      return op->emitOpError() << "expected memref type in attribution";
-
-    // We can only verify the address space if it hasn't already been lowered
-    // from the AddressSpaceAttr to a target-specific numeric value.
-    auto addressSpace =
-        type.getMemorySpace().dyn_cast_or_null<gpu::AddressSpaceAttr>();
-    if (!addressSpace)
-      continue;
-    if (addressSpace.getValue() != memorySpace)
-      return op->emitOpError()
-             << "expected memory space " << stringifyAddressSpace(memorySpace)
-             << " in attribution";
-  }
-  return success();
-}
-
  /// Verifies the body of the function.
  LogicalResult GPUFuncOp::verifyBody() {
    if (empty())
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp

index 8abf759..91c1c76 100644 (file)
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -190,7 +190,10 @@ static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
    }
    FunctionType type =
        FunctionType::get(launchOp.getContext(), kernelOperandTypes, {});
-  auto outlinedFunc = builder.create<gpu::GPUFuncOp>(loc, kernelFnName, type);
+  auto outlinedFunc = builder.create<gpu::GPUFuncOp>(
+      loc, kernelFnName, type,
+      TypeRange(ValueRange(launchOp.getWorkgroupAttributions())),
+      TypeRange(ValueRange(launchOp.getPrivateAttributions())));
    outlinedFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                          builder.getUnitAttr());
  
@@ -213,6 +216,16 @@ static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
    Region &outlinedFuncBody = outlinedFunc.getBody();
    injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map);
  
+  // Map memory attributions from the LaunchOp to the GPUFuncOp attributions.
+  for (const auto &[launchArg, funcArg] :
+       llvm::zip(launchOp.getWorkgroupAttributions(),
+                 outlinedFunc.getWorkgroupAttributions()))
+    map.map(launchArg, funcArg);
+  for (const auto &[launchArg, funcArg] :
+       llvm::zip(launchOp.getPrivateAttributions(),
+                 outlinedFunc.getPrivateAttributions()))
+    map.map(launchArg, funcArg);
+
    // Map arguments from gpu.launch region to the arguments of the gpu.func
    // operation.
    Block &entryBlock = outlinedFuncBody.front();
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir

index 422e0c1..ca77696 100644 (file)
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -310,3 +310,65 @@ func.func @non_constant_launches(%arg0 : index) {
  }
  
  // CHECK-DL-LABEL: gpu.module @non_constant_launches_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
+// CHECK: module attributes {gpu.container_module}
+
+// -----
+
+// This test checks memory attributions for gpu.launch, using both workgroup and private attributions.
+// CHECK-LABEL: func @launch_memory_attributions_0()
+func.func @launch_memory_attributions_0() {
+  %1 = "op"() : () -> (memref<?xf32, 1>)
+  %128 = arith.constant 128 : index
+
+  // CHECK: gpu.launch_func @launch_memory_attributions_0_kernel::@launch_memory_attributions_0_kernel
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %128, %grid_y = %128,
+                                       %grid_z = %128)
+             threads(%tx, %ty, %tz) in (%block_x = %128, %block_y = %128,
+                                        %block_z = %128)
+             workgroup(%shared: memref<42xf32, 3>)
+             private(%priv0: memref<2xf32, 5>, %priv1: memref<1xf32, 5>) {
+    "some_op"(%bx, %block_x) : (index, index) -> ()
+    %42 = memref.load %1[%tx] : memref<?xf32, 1>
+    %43 = memref.load %shared[%tx] : memref<42xf32, 3>
+    %44 = memref.load %priv1[%tx] : memref<1xf32, 5>
+    gpu.terminator
+  }
+  return
+}
+
+// CHECK-DL-LABEL: gpu.module @launch_memory_attributions_0_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
+// CHECK-LABEL: gpu.module @launch_memory_attributions_0_kernel
+// CHECK-NEXT: gpu.func @launch_memory_attributions_0_kernel
+// CHECK-SAME: workgroup(%[[KERNEL_ARG1:.*]] : memref<42xf32, 3>)
+// CHECK-SAME: private(%[[KERNEL_ARG2:.*]] : memref<2xf32, 5>, %[[KERNEL_ARG3:.*]] : memref<1xf32, 5>)
+// CHECK: %[[TID:.*]] = gpu.thread_id x
+// CHECK: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref<42xf32, 3>
+// CHECK-NEXT: = memref.load %[[KERNEL_ARG3]][%[[TID]]] : memref<1xf32, 5>
+
+// -----
+
+// This test checks correctness of private attributions in the absence of workgroup attributions.
+// CHECK-LABEL: @launch_memory_attributions_1
+func.func @launch_memory_attributions_1(%arg0 : memref<*xf32>) {
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %d = memref.dim %arg0, %c2 : memref<*xf32>
+  // CHECK: gpu.func {{.*}}  private(%[[KERNEL_ARG:.*]] : memref<3xf32, 5>) {{.*}} {
+  // CHECK:   %[[C2:.*]] = arith.constant 2 : index
+  // CHECK: = memref.load %[[KERNEL_ARG]][%[[C2]]] : memref<3xf32, 5>
+  // CHECK:   gpu.return
+  // CHECK: }
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1,
+                                       %grid_z = %c1)
+             threads(%tx, %ty, %tz) in (%block_x = %c1, %block_y = %c1,
+                                        %block_z = %c1)
+             private(%priv0: memref<3xf32, 5>) {
+    %42 = memref.load %priv0[%c2] : memref<3xf32, 5>
+    gpu.terminator
+  }
+  return
+}
+
+// CHECK-DL-LABEL: gpu.module @launch_memory_attributions_1_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
author	Fabian Mora <fmorac@udel.edu>
	Wed, 26 Apr 2023 22:52:37 +0000 (17:52 -0500)
committer	max <maksim.levental@gmail.com>
	Wed, 26 Apr 2023 22:53:18 +0000 (17:53 -0500)
mlir/include/mlir/Dialect/GPU/IR/GPUOps.td		patch \| blob \| history
mlir/lib/Dialect/GPU/IR/GPUDialect.cpp		patch \| blob \| history
mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp		patch \| blob \| history
mlir/test/Dialect/GPU/outlining.mlir		patch \| blob \| history