From 08778d8c4fd8a6519c7f27bfa6b09c47262cb844 Mon Sep 17 00:00:00 2001
From: Alex Zinenko <zinenko@google.com>
Date: Tue, 7 Jan 2020 20:00:54 +0100
Subject: [PATCH] [mlir][GPU] introduce utilities for promotion to workgroup
 memory

Introduce a set of functions that promote a memref argument of a `gpu.func` to
workgroup memory using memory attribution. The promotion boils down to
additional loops performing the copy from the original argument to the
attributed memory in the beginning of the function, and back at the end of the
function using all available threads. The loop bounds are specified so as to
adapt to any size of the workgroup. These utilities are intended to compose
with other existing utilities (loop coalescing and tiling) in cases where the
distribution of work across threads is uneven, e.g. copying a 2D memref with
only the threads along the "x" dimension. Similarly, specialization of the
kernel to specific launch sizes should be implemented as a separate pass
combining constant propagation and canonicalization.

Introduce a simple attribute-driven pass to test the promotion transformation
since we don't have a heuristic at the moment.

Differential revision: https://reviews.llvm.org/D71904
---
 mlir/include/mlir/Dialect/GPU/GPUDialect.h         |   5 +
 mlir/include/mlir/Dialect/GPU/GPUOps.td            |   4 +
 mlir/include/mlir/Dialect/GPU/MemoryPromotion.h    |  29 ++++
 mlir/include/mlir/IR/Block.h                       |   5 +
 mlir/lib/Dialect/GPU/CMakeLists.txt                |  20 ++-
 mlir/lib/Dialect/GPU/IR/GPUDialect.cpp             |  18 +++
 .../lib/Dialect/GPU/Transforms/MemoryPromotion.cpp | 173 +++++++++++++++++++++
 mlir/lib/IR/Block.cpp                              |  14 ++
 mlir/test/Dialect/GPU/promotion.mlir               | 119 ++++++++++++++
 mlir/test/lib/Transforms/CMakeLists.txt            |   3 +
 .../test/lib/Transforms/TestGpuMemoryPromotion.cpp |  40 +++++
 11 files changed, 428 insertions(+), 2 deletions(-)
 create mode 100644 mlir/include/mlir/Dialect/GPU/MemoryPromotion.h
 create mode 100644 mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp
 create mode 100644 mlir/test/Dialect/GPU/promotion.mlir
 create mode 100644 mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp

diff --git a/mlir/include/mlir/Dialect/GPU/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/GPUDialect.h
index 1776ff7..a21b514 100644
--- a/mlir/include/mlir/Dialect/GPU/GPUDialect.h
+++ b/mlir/include/mlir/Dialect/GPU/GPUDialect.h
@@ -53,6 +53,11 @@ public:
   /// 'gpu.kernel' attribute.
   static bool isKernel(Operation *op);
 
+  /// Returns the number of workgroup (thread, block) dimensions supported in
+  /// the GPU dialect.
+  // TODO(zinenko,herhut): consider generalizing this.
+  static unsigned getNumWorkgroupDimensions() { return 3; }
+
   /// Returns the numeric value used to identify the workgroup memory address
   /// space.
   static unsigned getWorkgroupAddressSpace() { return 3; }
diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
index b5b93e9..766ddbf 100644
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -117,6 +117,10 @@ def GPU_GPUFuncOp : GPU_Op<"func", [FunctionLike, IsolatedFromAbove, Symbol]> {
   ];
 
   let extraClassDeclaration = [{
+    /// Adds a workgroup attribution of the MemRef type with the given shape and
+    /// element type.
+    Value addWorkgroupAttribution(ArrayRef<int64_t> shape, Type elementType);
+
     /// Returns `true` if the GPU function defined by this Op is a kernel, i.e.
     /// it is intended to be launched from host.
     bool isKernel() {
diff --git a/mlir/include/mlir/Dialect/GPU/MemoryPromotion.h b/mlir/include/mlir/Dialect/GPU/MemoryPromotion.h
new file mode 100644
index 0000000..09c1371
--- /dev/null
+++ b/mlir/include/mlir/Dialect/GPU/MemoryPromotion.h
@@ -0,0 +1,29 @@
+//===- MemoryPromotion.h - Utilities for moving data across GPU -*- C++ -*-===//
+//
+// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This header file declares the utility functions that generate IR copying
+// the data between different levels of memory hierarchy.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_GPU_MEMORYPROMOTION_H
+#define MLIR_DIALECT_GPU_MEMORYPROMOTION_H
+
+namespace mlir {
+
+namespace gpu {
+class GPUFuncOp;
+}
+
+/// Promotes a function argument to workgroup memory in the given function. The
+/// copies will be inserted in the beginning and in the end of the function.
+void promoteToWorkgroupMemory(gpu::GPUFuncOp op, unsigned arg);
+
+} // end namespace mlir
+
+#endif // MLIR_DIALECT_GPU_MEMORYPROMOTION_H
diff --git a/mlir/include/mlir/IR/Block.h b/mlir/include/mlir/IR/Block.h
index c868148..2d3eb18 100644
--- a/mlir/include/mlir/IR/Block.h
+++ b/mlir/include/mlir/IR/Block.h
@@ -79,6 +79,11 @@ public:
   /// Add one value to the argument list.
   BlockArgument addArgument(Type type);
 
+  /// Insert one value to the position in the argument list indicated by the
+  /// given iterator. The existing arguments are shifted. The block is expected
+  /// not to have predecessors.
+  BlockArgument insertArgument(args_iterator it, Type type);
+
   /// Add one argument to the argument list for each type specified in the list.
   iterator_range<args_iterator> addArguments(ArrayRef<Type> types);
 
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
index 6fe45ba..dbf05ac 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -2,9 +2,25 @@ add_llvm_library(MLIRGPU
   IR/GPUDialect.cpp
   IR/DialectRegistration.cpp
   Transforms/KernelOutlining.cpp
+  Transforms/MemoryPromotion.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU
 )
-add_dependencies(MLIRGPU MLIRGPUOpsIncGen MLIRIR MLIRLLVMIR LLVMSupport)
-target_link_libraries(MLIRGPU MLIRIR MLIRLLVMIR MLIRStandardOps LLVMSupport)
+add_dependencies(MLIRGPU
+  MLIRGPUOpsIncGen
+  MLIREDSC
+  MLIRIR
+  MLIRLLVMIR
+  MLIRLoopOps
+  MLIRSupport
+  MLIRTransformUtils
+  LLVMSupport)
+target_link_libraries(MLIRGPU
+  MLIREDSC
+  MLIRIR
+  MLIRLLVMIR
+  MLIRLoopOps
+  MLIRSupport
+  MLIRTransformUtils
+  LLVMSupport)
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index bda8032..32d7fae 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -593,6 +593,24 @@ LogicalResult verify(LaunchFuncOp op) {
 // GPUFuncOp
 //===----------------------------------------------------------------------===//
 
+/// Adds a workgroup attribution of the MemRef type with the given shape and
+/// element type after the existing ones, and returns the new block argument.
+Value GPUFuncOp::addWorkgroupAttribution(ArrayRef<int64_t> shape,
+                                         Type elementType) {
+  unsigned pos = getNumFuncArguments() + getNumWorkgroupAttributions();
+  Block &bodyBlock = body().front();
+  Value attribution = bodyBlock.insertArgument(
+      std::next(bodyBlock.args_begin(), pos),
+      MemRefType::get(shape, elementType, /*affineMapComposition=*/{},
+                      GPUDialect::getWorkgroupAddressSpace()));
+  auto numWorkgroupBuffersAttr =
+      getAttrOfType<IntegerAttr>(getNumWorkgroupAttributionsAttrName());
+  setAttr(getNumWorkgroupAttributionsAttrName(),
+          IntegerAttr::get(numWorkgroupBuffersAttr.getType(),
+                           numWorkgroupBuffersAttr.getValue() + 1));
+  return attribution;
+}
+
 void GPUFuncOp::build(Builder *builder, OperationState &result, StringRef name,
                       FunctionType type, ArrayRef<Type> workgroupAttributions,
                       ArrayRef<Type> privateAttributions,
diff --git a/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp
new file mode 100644
index 0000000..f01a430
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp
@@ -0,0 +1,173 @@
+//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements utilities that allow one to create IR moving the data
+// across different levels of the GPU memory hierarchy.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/MemoryPromotion.h"
+#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/LoopOps/LoopOps.h"
+#include "mlir/EDSC/Builders.h"
+#include "mlir/EDSC/Helpers.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/Functional.h"
+#include "mlir/Transforms/LoopUtils.h"
+
+using namespace mlir;
+using namespace mlir::gpu;
+
+/// Maps the numeric identifier of a GPU dimension to its textual name.
+static StringRef getDimName(unsigned dim) {
+  // Names of the supported dimensions, indexed by dimension identifier.
+  static const char *const dimNames[] = {"x", "y", "z"};
+  constexpr unsigned numDimNames = sizeof(dimNames) / sizeof(dimNames[0]);
+
+  // Identifiers beyond the table are a caller error.
+  if (dim >= numDimNames)
+    llvm_unreachable("dimension ID overflow");
+  return dimNames[dim];
+}
+
+/// Emits the (imperfect) loop nest copying the elements of "from" into "to",
+/// iterating over the given "bounds". Emits at least
+/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with
+/// single-iteration loops. Maps the innermost loops to thread dimensions, in
+/// reverse order to enable access coalescing in the innermost loop.
+static void insertCopyLoops(OpBuilder &builder, Location loc,
+                            edsc::MemRefView &bounds, Value from, Value to) {
+  // Create EDSC handles for bounds.
+  unsigned rank = bounds.rank();
+  SmallVector<edsc::ValueHandle, 4> lbs, ubs, steps;
+
+  // Make sure we have enough loops to use all thread dimensions, these trivial
+  // loops should be outermost and therefore inserted first.
+  if (rank < GPUDialect::getNumWorkgroupDimensions()) {
+    unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank;
+    edsc::ValueHandle zero = edsc::intrinsics::constant_index(0);
+    edsc::ValueHandle one = edsc::intrinsics::constant_index(1);
+    lbs.resize(extraLoops, zero);
+    ubs.resize(extraLoops, one);
+    steps.resize(extraLoops, one);
+  }
+
+  // Add existing bounds after the trivial ones.
+  lbs.append(bounds.getLbs().begin(), bounds.getLbs().end());
+  ubs.append(bounds.getUbs().begin(), bounds.getUbs().end());
+
+  // Emit constant operations for steps.
+  steps.reserve(lbs.size());
+  llvm::transform(
+      bounds.getSteps(), std::back_inserter(steps),
+      [](int64_t step) { return edsc::intrinsics::constant_index(step); });
+
+  // Obtain the thread id and block size of every workgroup dimension to map to.
+  auto indexType = builder.getIndexType();
+  SmallVector<Value, 3> threadIds, blockDims;
+  for (unsigned i = 0; i < GPUDialect::getNumWorkgroupDimensions(); ++i) {
+    auto dimName = builder.getStringAttr(getDimName(i));
+    threadIds.push_back(
+        builder.create<gpu::ThreadIdOp>(loc, indexType, dimName));
+    blockDims.push_back(
+        builder.create<gpu::BlockDimOp>(loc, indexType, dimName));
+  }
+
+  // Produce the loop nest with copies; only the last "rank" ivs index memrefs.
+  auto ivs = edsc::makeIndexHandles(lbs.size());
+  auto ivPtrs =
+      edsc::makeHandlePointers(MutableArrayRef<edsc::IndexHandle>(ivs));
+  edsc::LoopNestBuilder(ivPtrs, lbs, ubs, steps)([&]() {
+    auto activeIvs = llvm::makeArrayRef(ivs).take_back(rank);
+    edsc::StdIndexedValue fromHandle(from), toHandle(to);
+    toHandle(activeIvs) = fromHandle(activeIvs);
+  });
+
+  // Map the innermost loops to threads in reverse order.
+  for (auto en :
+       llvm::enumerate(llvm::reverse(llvm::makeArrayRef(ivs).take_back(
+           GPUDialect::getNumWorkgroupDimensions())))) {
+    auto loop = cast<loop::ForOp>(
+        en.value().getValue().getParentRegion()->getParentOp());
+    mapLoopToProcessorIds(loop, {threadIds[en.index()]},
+                          {blockDims[en.index()]});
+  }
+}
+
+/// Emits the loop nests performing the copy to the designated location in the
+/// beginning of the region, and from the designated location immediately before
+/// the terminator of the first block of the region. The region is expected to
+/// have one block. This boils down to the following structure
+///
+///   ^bb(...):
+///     <loop-bound-computation>
+///     for %arg0 = ... to ... step ... {
+///       ...
+///         for %argN = <thread-id-x> to ... step <block-dim-x> {
+///           %0 = load %from[%arg0, ..., %argN]
+///           store %0, %to[%arg0, ..., %argN]
+///         }
+///       ...
+///     }
+///     gpu.barrier
+///     <... original body ...>
+///     gpu.barrier
+///     for %arg0 = ... to ... step ... {
+///       ...
+///         for %argN = <thread-id-x> to ... step <block-dim-x> {
+///           %1 = load %to[%arg0, ..., %argN]
+///           store %1, %from[%arg0, ..., %argN]
+///         }
+///       ...
+///     }
+///
+/// Inserts the barriers unconditionally since different threads may be copying
+/// values and reading them. An analysis would be required to eliminate barriers
+/// in case where value is only used by the thread that copies it. Both copies
+/// are inserted unconditionally, an analysis would be required to only copy
+/// live-in and live-out values when necessary. This copies the entire memref
+/// pointed to by "from". In case a smaller block would be sufficient, the
+/// caller can create a subview of the memref and promote it instead.
+static void insertCopies(Region &region, Location loc, Value from, Value to) {
+  auto fromType = from.getType().cast<MemRefType>();
+  auto toType = to.getType().cast<MemRefType>();
+  (void)fromType; // Only referenced by the assertions below.
+  (void)toType;
+  assert(fromType.getShape() == toType.getShape());
+  assert(fromType.getRank() != 0);
+  assert(has_single_element(region) &&
+         "unstructured control flow not supported");
+
+  OpBuilder builder(region.getContext());
+  builder.setInsertionPointToStart(&region.front());
+
+  edsc::ScopedContext edscContext(builder, loc);
+  edsc::MemRefView fromView(from); // Bounds reused for both copy directions.
+  insertCopyLoops(builder, loc, fromView, from, to);
+  builder.create<gpu::BarrierOp>(loc);
+
+  builder.setInsertionPoint(&region.front().back()); // Before the last op.
+  builder.create<gpu::BarrierOp>(loc);
+  insertCopyLoops(builder, loc, fromView, to, from);
+}
+
+/// Promotes the function argument at position "arg" of "op" to workgroup
+/// memory. Copies are inserted in the beginning and in the end of the function.
+void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) {
+  Value value = op.getArgument(arg);
+  auto type = value.getType().dyn_cast<MemRefType>();
+  assert(type && type.hasStaticShape() &&
+         "can only promote memrefs with static shape");
+  Value attribution =
+      op.addWorkgroupAttribution(type.getShape(), type.getElementType());
+
+  // Replace the uses first since only the original uses are currently present.
+  // Then insert the copies.
+  value.replaceAllUsesWith(attribution);
+  insertCopies(op.getBody(), op.getLoc(), value, attribution);
+}
diff --git a/mlir/lib/IR/Block.cpp b/mlir/lib/IR/Block.cpp
index b0ada99..2757c50 100644
--- a/mlir/lib/IR/Block.cpp
+++ b/mlir/lib/IR/Block.cpp
@@ -179,6 +179,20 @@ void Block::eraseArgument(unsigned index, bool updatePredTerms) {
   }
 }
 
+/// Inserts a new argument of "type" at position "it" and returns it. Existing
+/// arguments are shifted. The block is expected not to have predecessors.
+BlockArgument Block::insertArgument(args_iterator it, Type type) {
+  assert(llvm::empty(getPredecessors()) &&
+         "cannot insert arguments to blocks with predecessors");
+
+  // Use the args_iterator (on the BlockArgListType) to compute the insertion
+  // iterator in the underlying argument storage.
+  size_t distance = std::distance(args_begin(), it);
+  auto arg = BlockArgument::create(type, this);
+  arguments.insert(std::next(arguments.begin(), distance), arg);
+  return arg;
+}
+
 //===----------------------------------------------------------------------===//
 // Terminator management
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/GPU/promotion.mlir b/mlir/test/Dialect/GPU/promotion.mlir
new file mode 100644
index 0000000..c06174e
--- /dev/null
+++ b/mlir/test/Dialect/GPU/promotion.mlir
@@ -0,0 +1,119 @@
+// RUN: mlir-opt -test-gpu-memory-promotion -split-input-file %s | FileCheck %s
+
+module @foo attributes {gpu.kernel_module} {
+  // Verify that the attribution was indeed introduced
+  // CHECK-LABEL: @memref3d
+  // CHECK-SAME: (%[[arg:.*]]: memref<5x4xf32>
+  // CHECK-SAME: workgroup(%[[promoted:.*]] : memref<5x4xf32, 3>)
+  gpu.func @memref3d(%arg0: memref<5x4xf32> {gpu.test_promote_workgroup}) kernel {
+    // Verify that loop bounds are emitted, the order does not matter.
+    // CHECK-DAG: %[[c1:.*]] = constant 1
+    // CHECK-DAG: %[[c4:.*]] = constant 4
+    // CHECK-DAG: %[[c5:.*]] = constant 5
+    // CHECK-DAG: %[[tx:.*]] = "gpu.thread_id"() {dimension = "x"}
+    // CHECK-DAG: %[[ty:.*]] = "gpu.thread_id"() {dimension = "y"}
+    // CHECK-DAG: %[[tz:.*]] = "gpu.thread_id"() {dimension = "z"}
+    // CHECK-DAG: %[[bdx:.*]] = "gpu.block_dim"() {dimension = "x"}
+    // CHECK-DAG: %[[bdy:.*]] = "gpu.block_dim"() {dimension = "y"}
+    // CHECK-DAG: %[[bdz:.*]] = "gpu.block_dim"() {dimension = "z"}
+
+    // Verify that loops for the copy are emitted. We only check the number of
+    // loops here since their bounds are produced by mapLoopToProcessorIds,
+    // tested separately.
+    // CHECK: loop.for %[[i0:.*]] =
+    // CHECK:   loop.for %[[i1:.*]] =
+    // CHECK:     loop.for %[[i2:.*]] =
+
+    // Verify that the copy is emitted and uses only the last two loops.
+    // CHECK:       %[[v:.*]] = load %[[arg]][%[[i1]], %[[i2]]]
+    // CHECK:       store %[[v]], %[[promoted]][%[[i1]], %[[i2]]]
+
+    // Verify that the use has been rewritten.
+    // CHECK: "use"(%[[promoted]]) : (memref<5x4xf32, 3>)
+    "use"(%arg0) : (memref<5x4xf32>) -> ()
+
+
+    // Verify that loops for the copy are emitted. We only check the number of
+    // loops here since their bounds are produced by mapLoopToProcessorIds,
+    // tested separately.
+    // CHECK: loop.for %[[i0:.*]] =
+    // CHECK:   loop.for %[[i1:.*]] =
+    // CHECK:     loop.for %[[i2:.*]] =
+
+    // Verify that the copy is emitted and uses only the last two loops.
+    // CHECK:       %[[v:.*]] = load %[[promoted]][%[[i1]], %[[i2]]]
+    // CHECK:       store %[[v]], %[[arg]][%[[i1]], %[[i2]]]
+    gpu.return
+  }
+}
+
+// -----
+
+module @foo attributes {gpu.kernel_module} {
+  // Verify that the attribution was indeed introduced
+  // CHECK-LABEL: @memref5d
+  // CHECK-SAME: (%[[arg:.*]]: memref<8x7x6x5x4xf32>
+  // CHECK-SAME: workgroup(%[[promoted:.*]] : memref<8x7x6x5x4xf32, 3>)
+  gpu.func @memref5d(%arg0: memref<8x7x6x5x4xf32> {gpu.test_promote_workgroup}) kernel {
+    // Verify that loop bounds are emitted, the order does not matter.
+    // CHECK-DAG: %[[c0:.*]] = constant 0
+    // CHECK-DAG: %[[c1:.*]] = constant 1
+    // CHECK-DAG: %[[c4:.*]] = constant 4
+    // CHECK-DAG: %[[c5:.*]] = constant 5
+    // CHECK-DAG: %[[c6:.*]] = constant 6
+    // CHECK-DAG: %[[c7:.*]] = constant 7
+    // CHECK-DAG: %[[c8:.*]] = constant 8
+    // CHECK-DAG: %[[tx:.*]] = "gpu.thread_id"() {dimension = "x"}
+    // CHECK-DAG: %[[ty:.*]] = "gpu.thread_id"() {dimension = "y"}
+    // CHECK-DAG: %[[tz:.*]] = "gpu.thread_id"() {dimension = "z"}
+    // CHECK-DAG: %[[bdx:.*]] = "gpu.block_dim"() {dimension = "x"}
+    // CHECK-DAG: %[[bdy:.*]] = "gpu.block_dim"() {dimension = "y"}
+    // CHECK-DAG: %[[bdz:.*]] = "gpu.block_dim"() {dimension = "z"}
+
+    // Verify that loops for the copy are emitted.
+    // CHECK: loop.for %[[i0:.*]] =
+    // CHECK:   loop.for %[[i1:.*]] =
+    // CHECK:     loop.for %[[i2:.*]] =
+    // CHECK:       loop.for %[[i3:.*]] =
+    // CHECK:         loop.for %[[i4:.*]] =
+
+    // Verify that the copy is emitted.
+    // CHECK:           %[[v:.*]] = load %[[arg]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+    // CHECK:           store %[[v]], %[[promoted]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+
+    // Verify that the use has been rewritten.
+    // CHECK: "use"(%[[promoted]]) : (memref<8x7x6x5x4xf32, 3>)
+    "use"(%arg0) : (memref<8x7x6x5x4xf32>) -> ()
+
+    // Verify that loops for the copy are emitted.
+    // CHECK: loop.for %[[i0:.*]] =
+    // CHECK:   loop.for %[[i1:.*]] =
+    // CHECK:     loop.for %[[i2:.*]] =
+    // CHECK:       loop.for %[[i3:.*]] =
+    // CHECK:         loop.for %[[i4:.*]] =
+
+    // Verify that the copy is emitted.
+    // CHECK:           %[[v:.*]] = load %[[promoted]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+    // CHECK:           store %[[v]], %[[arg]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]]
+    gpu.return
+  }
+}
+
+// -----
+
+module @foo attributes {gpu.kernel_module} {
+  // Check that attribution insertion works fine.
+  // CHECK-LABEL: @insert
+  // CHECK-SAME: (%{{.*}}: memref<4xf32>
+  // CHECK-SAME: workgroup(%{{.*}}: memref<1x1xf64, 3>
+  // CHECK-SAME: %[[wg2:.*]] : memref<4xf32, 3>)
+  // CHECK-SAME: private(%{{.*}}: memref<1x1xi64, 5>)
+  gpu.func @insert(%arg0: memref<4xf32> {gpu.test_promote_workgroup})
+      workgroup(%arg1: memref<1x1xf64, 3>)
+      private(%arg2: memref<1x1xi64, 5>)
+      kernel {
+    // CHECK: "use"(%[[wg2]])
+    "use"(%arg0) : (memref<4xf32>) -> ()
+    gpu.return
+  }
+}
diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt
index b6338e1..ac4a493 100644
--- a/mlir/test/lib/Transforms/CMakeLists.txt
+++ b/mlir/test/lib/Transforms/CMakeLists.txt
@@ -2,6 +2,7 @@ add_llvm_library(MLIRTestTransforms
   TestCallGraph.cpp
   TestConstantFold.cpp
   TestLoopFusion.cpp
+  TestGpuMemoryPromotion.cpp
   TestInlining.cpp
   TestLinalgTransforms.cpp
   TestLiveness.cpp
@@ -26,6 +27,8 @@ add_dependencies(MLIRTestTransforms MLIRTestVectorTransformPatternsIncGen)
 target_link_libraries(MLIRTestTransforms
   MLIRAffineOps
   MLIRAnalysis
+  MLIREDSC
+  MLIRGPU
   MLIRLoopOps
   MLIRPass
   MLIRTestDialect
diff --git a/mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp b/mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp
new file mode 100644
index 0000000..ee02918
--- /dev/null
+++ b/mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp
@@ -0,0 +1,40 @@
+//===- TestGpuMemoryPromotion.cpp - Test pass for GPU promotion -----------===//
+//
+// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the pass testing the utilities for moving data across
+// different levels of the GPU memory hierarchy.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/GPU/MemoryPromotion.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/Pass/Pass.h"
+
+using namespace mlir;
+
+namespace {
+/// Test-only pass exercising the promotion to workgroup memory on GPU
+/// functions. Every argument carrying the "gpu.test_promote_workgroup" unit
+/// attribute is promoted; neither the legality (e.g., amount of memory used)
+/// nor the benefit (e.g., coalescing of loads) of the promotion is checked.
+class TestGpuMemoryPromotionPass
+    : public OperationPass<TestGpuMemoryPromotionPass, gpu::GPUFuncOp> {
+  void runOnOperation() override {
+    gpu::GPUFuncOp func = getOperation();
+    // The argument count is read once, before any promotion takes place.
+    const unsigned numArgs = func.getNumArguments();
+    for (unsigned idx = 0; idx < numArgs; ++idx)
+      if (func.getArgAttrOfType<UnitAttr>(idx, "gpu.test_promote_workgroup"))
+        promoteToWorkgroupMemory(func, idx);
+  }
+};
+} // end namespace
+
+static PassRegistration<TestGpuMemoryPromotionPass> registration(
+    "test-gpu-memory-promotion",
+    "Promotes the annotated arguments of gpu.func to workgroup memory.");
-- 
2.7.4