From 9d03f5674f4e511d834b3de9d24eb1248a06f864 Mon Sep 17 00:00:00 2001
From: Alex Zinenko
Date: Tue, 9 Jul 2019 06:37:17 -0700
Subject: [PATCH] Implement parametric tiling on standard for loops

Parametric tiling can be used to extract outer loops with a fixed number of
iterations. This in turn enables mapping to GPU kernels on a fixed grid
independently of the range of the original loops, which may be unknown
statically, making the kernel adaptable to different sizes. Provide a utility
function that also computes the parametric tile size given the range of the
loop. Exercise the utility function through a simple pass that applies it to
all top-level loop nests. Permutability or parallelism checks must be
performed before calling this utility function in actual passes.

Note that parametric tiling cannot be implemented in a purely affine way,
although it can be encoded using semi-affine maps. The choice to implement it
on standard loops is guided by them being the common representation between
Affine loops, Linalg and GPU kernels.

PiperOrigin-RevId: 257180251
---
 mlir/include/mlir/IR/OpDefinition.h          |   4 +-
 mlir/include/mlir/Pass/PassRegistry.h        |  13 +-
 mlir/include/mlir/StandardOps/Ops.td         |   2 +
 mlir/include/mlir/Transforms/LoopUtils.h     |  12 ++
 mlir/include/mlir/Transforms/Passes.h        |   6 +
 mlir/lib/Transforms/LoopParametricTiling.cpp |  73 +++++++++++
 mlir/lib/Transforms/Utils/LoopUtils.cpp      | 173 +++++++++++++++++++++++++--
 mlir/test/Transforms/parametric_tiling.mlir  |  66 ++++++++++
 8 files changed, 336 insertions(+), 13 deletions(-)
 create mode 100644 mlir/lib/Transforms/LoopParametricTiling.cpp
 create mode 100644 mlir/test/Transforms/parametric_tiling.mlir

diff --git a/mlir/include/mlir/IR/OpDefinition.h b/mlir/include/mlir/IR/OpDefinition.h
index 6913b76..32f7efa 100644
--- a/mlir/include/mlir/IR/OpDefinition.h
+++ b/mlir/include/mlir/IR/OpDefinition.h
@@ -798,7 +798,9 @@ public:
   Dialect *getDialect() { return getOperation()->getDialect(); }

   /// Return the Region enclosing this Op.
-  Region *getContainingRegion() { return getOperation()->getParentRegion(); }
+  Region *getContainingRegion() {
+    return getOperation()->getContainingRegion();
+  }

   /// Return true if this "op class" can match against the specified operation.
   /// This hook can be overridden with a more specific implementation in
diff --git a/mlir/include/mlir/Pass/PassRegistry.h b/mlir/include/mlir/Pass/PassRegistry.h
index 1708818..27096f3 100644
--- a/mlir/include/mlir/Pass/PassRegistry.h
+++ b/mlir/include/mlir/Pass/PassRegistry.h
@@ -105,16 +105,21 @@ void registerPass(StringRef arg, StringRef description, const PassID *passID,
                   const PassAllocatorFunction &function);

 /// PassRegistration provides a global initializer that registers a Pass
-/// allocation routine for a concrete pass instance.
+/// allocation routine for a concrete pass instance. The third argument is
+/// optional and provides a callback to construct a pass that does not have
+/// a default constructor.
 ///
 /// Usage:
 ///
 ///   // At namespace scope.
 ///   static PassRegistration<MyPass> Unused("unused", "Unused pass");
 template <typename ConcretePass> struct PassRegistration {
-  PassRegistration(StringRef arg, StringRef description) {
-    registerPass(arg, description, PassID::getID<ConcretePass>(),
-                 [] { return new ConcretePass(); });
+  PassRegistration(
+      StringRef arg, StringRef description,
+      const PassAllocatorFunction &constructor = [] {
+        return new ConcretePass();
+      }) {
+    registerPass(arg, description, PassID::getID<ConcretePass>(), constructor);
   }
 };

diff --git a/mlir/include/mlir/StandardOps/Ops.td b/mlir/include/mlir/StandardOps/Ops.td
index 189d3c8..8e37a58 100644
--- a/mlir/include/mlir/StandardOps/Ops.td
+++ b/mlir/include/mlir/StandardOps/Ops.td
@@ -697,6 +697,8 @@ def ForOp : Std_Op<"for"> {
     OpBuilder getBodyBuilder() {
       return OpBuilder(body(), std::prev(body()->end()));
     }
+    void setLowerBound(Value *bound) { getOperation()->setOperand(0, bound); }
+    void setUpperBound(Value *bound) { getOperation()->setOperand(1, bound); }
   }];
 }

diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h
index 1e46f23..6545558 100644
--- a/mlir/include/mlir/Transforms/LoopUtils.h
+++ b/mlir/include/mlir/Transforms/LoopUtils.h
@@ -30,6 +30,7 @@ namespace mlir {
 class AffineMap;
 class AffineForOp;
+class ForOp;
 class FuncOp;
 using Function = FuncOp;
 class OpBuilder;
@@ -140,6 +141,17 @@ SmallVector<SmallVector<AffineForOp, 8>, 8> tile(ArrayRef<AffineForOp> forOps,
 SmallVector<AffineForOp, 8> tile(ArrayRef<AffineForOp> forOps,
                                  ArrayRef<uint64_t> sizes, AffineForOp target);

+/// Tile a nest of standard for loops rooted at `rootForOp` with the given
+/// (parametric) sizes. Sizes are expected to be strictly positive values at
+/// runtime. If more sizes than loops are provided, the trailing values are
+/// discarded. Assumes the loop nest is permutable.
+void tile(ForOp rootForOp, ArrayRef<Value *> sizes);
+
+/// Tile a nest of standard for loops rooted at `rootForOp` by finding
+/// parametric tile sizes such that the outer loops have a fixed number of
+/// iterations, as defined in `sizes`.
+void extractFixedOuterLoops(ForOp rootForOp, ArrayRef<int64_t> sizes);
+
 } // end namespace mlir

 #endif // MLIR_TRANSFORMS_LOOP_UTILS_H
diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h
index a253871..83532f2 100644
--- a/mlir/include/mlir/Transforms/Passes.h
+++ b/mlir/include/mlir/Transforms/Passes.h
@@ -99,6 +99,12 @@ FunctionPassBase *createLowerAffinePass();
 /// Creates a pass to perform tiling on loop nests.
 FunctionPassBase *createLoopTilingPass(uint64_t cacheSizeBytes);

+/// Creates a pass that performs parametric tiling so that the outermost loops
+/// have the given fixed number of iterations. Assumes outermost loop nests
+/// are permutable.
+FunctionPassBase *
+createSimpleParametricTilingPass(ArrayRef<int64_t> outerLoopSizes);
+
 /// Promotes all accessed memref regions to the specified faster memory space
 /// while generating DMAs to move data.
 FunctionPassBase *createDmaGenerationPass(
diff --git a/mlir/lib/Transforms/LoopParametricTiling.cpp b/mlir/lib/Transforms/LoopParametricTiling.cpp
new file mode 100644
index 0000000..c2b2394
--- /dev/null
+++ b/mlir/lib/Transforms/LoopParametricTiling.cpp
@@ -0,0 +1,73 @@
+//===- LoopParametricTiling.cpp --- Parametric loop tiling pass ----------===//
+//
+// Copyright 2019 The MLIR Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+//
+// This file implements a pass to parametrically tile nests of standard loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Pass/Pass.h"
+#include "mlir/StandardOps/Ops.h"
+#include "mlir/Transforms/LoopUtils.h"
+#include "mlir/Transforms/Passes.h"
+
+#include "mlir/IR/Builders.h"
+
+using namespace mlir;
+
+static llvm::cl::list<int> clOuterLoopSizes(
+    "outer-loop-sizes", llvm::cl::MiscFlags::CommaSeparated,
+    llvm::cl::desc(
+        "fixed number of iterations that the outer loops should have"));
+
+namespace {
+// Extracts fixed-range loops for top-level loop nests with ranges defined in
+// the pass constructor. Assumes loops are permutable.
+class SimpleParametricLoopTilingPass
+    : public FunctionPass<SimpleParametricLoopTilingPass> {
+public:
+  explicit SimpleParametricLoopTilingPass(ArrayRef<int64_t> outerLoopSizes)
+      : sizes(outerLoopSizes.begin(), outerLoopSizes.end()) {}
+
+  void runOnFunction() override {
+    Function func = getFunction();
+
+    func.walk<ForOp>([this](ForOp op) {
+      // Ignore nested loops.
+      if (op.getContainingRegion()->getParentOfType<ForOp>())
+        return;
+      extractFixedOuterLoops(op, sizes);
+    });
+  }
+
+  SmallVector<int64_t, 4> sizes;
+};
+} // end namespace
+
+FunctionPassBase *
+mlir::createSimpleParametricTilingPass(ArrayRef<int64_t> outerLoopSizes) {
+  return new SimpleParametricLoopTilingPass(outerLoopSizes);
+}
+
+static PassRegistration<SimpleParametricLoopTilingPass>
+    reg("extract-fixed-outer-loops",
+        "apply parametric tiling to the outer loops so that the ranges of "
+        "outer loops become static",
+        [] {
+          auto *pass = new SimpleParametricLoopTilingPass({});
+          pass->sizes.assign(clOuterLoopSizes.begin(), clOuterLoopSizes.end());
+          return pass;
+        });
diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp
index 65847fc..8fbd59b 100644
--- a/mlir/lib/Transforms/Utils/LoopUtils.cpp
+++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp
@@ -351,20 +351,36 @@ LogicalResult mlir::instBodySkew(AffineForOp forOp, ArrayRef<uint64_t> shifts,
   return success();
 }

+// Collect perfectly nested loops starting from `rootForOp`. Loops are
+// perfectly nested if each loop is the first and only non-terminator operation
+// in the parent loop. Collect at most `maxLoops` loops and append them to
+// `forOps`.
+template <typename T>
+void getPerfectlyNestedLoopsImpl(
+    SmallVectorImpl<T> &forOps, T rootForOp,
+    unsigned maxLoops = std::numeric_limits<unsigned>::max()) {
+  for (unsigned i = 0; i < maxLoops; ++i) {
+    forOps.push_back(rootForOp);
+    // FIXME: ForOp and AffineForOp currently provide different names to access
+    // the region ("region" and "getRegion"). Remove this generic access when
+    // AffineForOp moves to ODS and also gets "region".
+    Block &body = rootForOp.getOperation()->getRegion(0).front();
+    if (body.begin() != std::prev(body.end(), 2))
+      return;
+
+    rootForOp = dyn_cast<T>(&body.front());
+    if (!rootForOp)
+      return;
+  }
+}
+
 /// Get perfectly nested sequence of loops starting at root of loop nest
 /// (the first op being another AffineFor, and the second op - a terminator).
 /// A loop is perfectly nested iff: the first op in the loop's body is another
 /// AffineForOp, and the second op is a terminator.
 void mlir::getPerfectlyNestedLoops(SmallVectorImpl<AffineForOp> &nestedLoops,
                                    AffineForOp root) {
-  AffineForOp curr = root;
-  nestedLoops.push_back(curr);
-  auto *currBody = curr.getBody();
-  while (currBody->begin() == std::prev(currBody->end(), 2) &&
-         (curr = dyn_cast<AffineForOp>(curr.getBody()->front()))) {
-    nestedLoops.push_back(curr);
-    currBody = curr.getBody();
-  }
+  getPerfectlyNestedLoopsImpl(nestedLoops, root);
 }

 /// Unrolls this loop completely.
@@ -762,3 +778,144 @@ SmallVector<AffineForOp, 8> mlir::tile(ArrayRef<AffineForOp> forOps,
                                        AffineForOp target) {
   return tile(forOps, sizes, ArrayRef<AffineForOp>{target})[0];
 }
+
+// Tile the given nest of standard for loops with the given (parametric) sizes.
+// Sizes are expected to be strictly positive values at runtime. If more sizes
+// than loops are provided, the trailing values are discarded. When applied to
+// a loop nest
+//   for %i_0 = %lb_0 to %ub_0 step %s_0 {
+//     for %i_1 = %lb_1 to %ub_1 step %s_1 {
+//       "op"(%i_0, %i_1) : (index, index) -> () }}
+// this splits the loops into tile loops with step %s_j * sizes[j] and the
+// original bounds, and point loops iterating from %i_j to
+// min(%i_j + %s_j * sizes[j], %ub_j) with the original step. No verification
+// that `forOps` is suitable for tiling is performed; this function only
+// applies the transformation.
+static void tile(MutableArrayRef<ForOp> forOps, ArrayRef<Value *> sizes) {
+  assert(sizes.size() >= forOps.size() && "insufficient number of tile sizes");
+  if (sizes.empty() || forOps.empty())
+    return;
+
+  ForOp rootForOp = forOps.front();
+  OpBuilder builder(rootForOp);
+
+  // Compute new steps for the outer loops.
+  SmallVector<Value *, 4> newSteps;
+  newSteps.reserve(sizes.size());
+  for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
+    auto op = forOps[i];
+    Value *newStep =
+        builder.create<MulIOp>(op.getLoc(), op.step(), sizes[i]);
+    newSteps.push_back(newStep);
+  }
+
+  // Create new outer loops nested one into another.
+  SmallVector<ForOp, 4> outerForOps;
+  for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
+    auto outerForOp =
+        builder.create<ForOp>(forOps[i].getLoc(), forOps[i].lowerBound(),
+                              forOps[i].upperBound(), newSteps[i]);
+
+    builder.setInsertionPointToStart(outerForOp.body());
+
+    // FIXME: builder should do this for us.
+    ensureStdTerminator(outerForOp.getOperation()->getRegion(0), builder,
+                        forOps[i].getLoc());
+    outerForOp.body()->addArgument(builder.getIndexType());
+    builder.setInsertionPointToStart(outerForOp.body());
+
+    outerForOps.push_back(outerForOp);
+  }
+
+  // Move the outermost original loop into the innermost new outer loop. Thus
+  // the body of the original loops does not need updating.
+  auto lastOuterForOp = outerForOps.back();
+  lastOuterForOp.body()->getOperations().splice(
+      lastOuterForOp.body()->getOperations().begin(),
+      rootForOp.getOperation()->getBlock()->getOperations(),
+      rootForOp.getOperation());
+
+  // Immediately before the (now sunk) outermost original loop, insert the
+  // computation of the upper bounds of the inner loops. Update the bounds of
+  // the original loops to make them point loops.
+  builder.setInsertionPointToStart(lastOuterForOp.body());
+  for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
+    Value *stepped = builder.create<AddIOp>(
+        forOps[i].getLoc(), outerForOps[i].getInductionVar(), newSteps[i]);
+    Value *less =
+        builder.create<CmpIOp>(forOps[i].getLoc(), CmpIPredicate::SLT,
+                               forOps[i].upperBound(), stepped);
+    Value *upperBound = builder.create<SelectOp>(
+        forOps[i].getLoc(), less, forOps[i].upperBound(), stepped);
+    forOps[i].setLowerBound(outerForOps[i].getInductionVar());
+    forOps[i].setUpperBound(upperBound);
+  }
+}
+
+void mlir::tile(ForOp rootForOp, ArrayRef<Value *> sizes) {
+  // Collect perfectly nested loops. If more size values are provided than
+  // there are nested loops, truncate `sizes`.
+  SmallVector<ForOp, 4> forOps;
+  forOps.reserve(sizes.size());
+  getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
+  if (forOps.size() < sizes.size())
+    sizes = sizes.take_front(forOps.size());
+
+  return ::tile(forOps, sizes);
+}
+
+// Build the IR that performs ceil division of a positive value by a constant:
+//   ceildiv(a, B) = divis(a + (B - 1), B)
+// where divis is rounding-to-zero division.
+static Value *ceilDivPositive(OpBuilder &builder, Location loc,
+                              Value *dividend, int64_t divisor) {
+  assert(divisor > 0 && "expected positive divisor");
+  assert(dividend->getType().isIndex() && "expected index-typed value");
+
+  Value *divisorMinusOneCst = builder.create<ConstantIndexOp>(loc, divisor - 1);
+  Value *divisorCst = builder.create<ConstantIndexOp>(loc, divisor);
+  Value *sum = builder.create<AddIOp>(loc, dividend, divisorMinusOneCst);
+  return builder.create<DivISOp>(loc, sum, divisorCst);
+}
+
+// Build the IR that performs ceil division of a positive value by another
+// positive value:
+//   ceildiv(a, b) = divis(a + (b - 1), b)
+static Value *ceilDivPositive(OpBuilder &builder, Location loc,
+                              Value *dividend, Value *divisor) {
+  assert(dividend->getType().isIndex() && "expected index-typed value");
+
+  Value *cstOne = builder.create<ConstantIndexOp>(loc, 1);
+  Value *divisorMinusOne = builder.create<SubIOp>(loc, divisor, cstOne);
+  Value *sum = builder.create<AddIOp>(loc, dividend, divisorMinusOne);
+  return builder.create<DivISOp>(loc, sum, divisor);
+}
+
+void mlir::extractFixedOuterLoops(ForOp rootForOp, ArrayRef<int64_t> sizes) {
+  // Collect perfectly nested loops. If more size values are provided than
+  // there are nested loops, truncate `sizes`.
+  SmallVector<ForOp, 4> forOps;
+  forOps.reserve(sizes.size());
+  getPerfectlyNestedLoopsImpl(forOps, rootForOp, sizes.size());
+  if (forOps.size() < sizes.size())
+    sizes = sizes.take_front(forOps.size());
+
+  OpBuilder builder(rootForOp);
+  auto loc = rootForOp.getLoc();
+
+  // Compute the tile sizes such that the i-th outer loop executes sizes[i]
+  // iterations. Given that the loop currently executes
+  //   numIterations = ceildiv((upperBound - lowerBound), step)
+  // iterations, we need to tile with size ceildiv(numIterations, sizes[i]).
+  SmallVector<Value *, 4> tileSizes;
+  tileSizes.reserve(sizes.size());
+  for (unsigned i = 0, e = sizes.size(); i < e; ++i) {
+    assert(sizes[i] > 0 && "expected strictly positive size for strip-mining");
+
+    auto forOp = forOps[i];
+    Value *diff =
+        builder.create<SubIOp>(loc, forOp.upperBound(), forOp.lowerBound());
+    Value *numIterations = ceilDivPositive(builder, loc, diff, forOp.step());
+    Value *iterationsPerBlock =
+        ceilDivPositive(builder, loc, numIterations, sizes[i]);
+    tileSizes.push_back(iterationsPerBlock);
+  }
+
+  // Call parametric tiling with the given sizes.
+  return ::tile(forOps, tileSizes);
+}
diff --git a/mlir/test/Transforms/parametric_tiling.mlir b/mlir/test/Transforms/parametric_tiling.mlir
new file mode 100644
index 0000000..201c9c5
--- /dev/null
+++ b/mlir/test/Transforms/parametric_tiling.mlir
@@ -0,0 +1,66 @@
+// RUN: mlir-opt -extract-fixed-outer-loops -outer-loop-sizes=7 %s | FileCheck %s --check-prefixes=COMMON,TILE_7
+// RUN: mlir-opt -extract-fixed-outer-loops -outer-loop-sizes=7,4 %s | FileCheck %s --check-prefixes=COMMON,TILE_74
+
+// COMMON-LABEL: @foo
+func @foo(%arg0: memref<?x?xf32>) {
+  %c2 = constant 2 : index
+  %c44 = constant 44 : index
+  %c1 = constant 1 : index
+
+  // Range of the original loop:
+  //   (upper - lower + step - 1) / step
+  // where step is known to be %c1.
+  // COMMON:      %[[diff:.*]] = subi %c44, %c2
+  // COMMON:      %[[adjustment:.*]] = subi %c1, %c1_{{.*}}
+  // COMMON-NEXT: %[[diff_adj:.*]] = addi %[[diff]], %[[adjustment]]
+  // COMMON-NEXT: %[[range:.*]] = divis %[[diff_adj]], %c1
+
+  // Ceildiv to get the parametric tile size.
+  // COMMON:      %[[sum:.*]] = addi %[[range]], %c6
+  // COMMON-NEXT: %[[size:.*]] = divis %[[sum]], %c7
+
+  // Range of the second original loop:
+  //   (upper - lower + step - 1) / step
+  // where step is known to be %c2.
+  // TILE_74:      %[[diff2:.*]] = subi %c44, %c1
+  // TILE_74:      %[[adjustment2:.*]] = subi %c2, %c1_{{.*}}
+  // TILE_74-NEXT: %[[diff2_adj:.*]] = addi %[[diff2]], %[[adjustment2]]
+  // TILE_74-NEXT: %[[range2:.*]] = divis %[[diff2_adj]], %c2
+
+  // Ceildiv to get the parametric tile size for the second original loop.
+  // TILE_74:      %[[sum2:.*]] = addi %[[range2]], %c3
+  // TILE_74-NEXT: %[[size2:.*]] = divis %[[sum2]], %c4
+
+  // New step(s) (original steps are %c1 and %c2).
+  // COMMON:  %[[step:.*]] = muli %c1, %[[size]]
+  // TILE_74: %[[step2:.*]] = muli %c2, %[[size2]]
+
+  // Updated outer loop(s) use the new steps.
+  // COMMON:  for %[[i:.*]] = %c2 to %c44 step %[[step]]
+  // TILE_74: for %[[j:.*]] = %c1 to %c44 step %[[step2]]
+  for %i = %c2 to %c44 step %c1 {
+    // Upper bound for the inner loop: min(%i + %step, %c44).
+    // COMMON:      %[[stepped:.*]] = addi %[[i]], %[[step]]
+    // COMMON-NEXT: cmpi "slt", %c44, %[[stepped]]
+    // COMMON-NEXT: %[[ub:.*]] = select {{.*}}, %c44, %[[stepped]]
+    //
+    // TILE_74:      %[[stepped2:.*]] = addi %[[j]], %[[step2]]
+    // TILE_74-NEXT: cmpi "slt", %c44, %[[stepped2]]
+    // TILE_74-NEXT: %[[ub2:.*]] = select {{.*}}, %c44, %[[stepped2]]

+    // Created inner loop.
+    // COMMON: for %[[ii:.*]] = %[[i]] to %[[ub]] step %c1

+    // This loop is not modified in the TILE_7 case.
+    // TILE_7: for %[[j:.*]] = %c1 to %c44 step %c2
+    //
+    // But it is modified in the TILE_74 case.
+    // TILE_74: for %[[jj:.*]] = %[[j]] to %[[ub2]] step %c2
+    for %j = %c1 to %c44 step %c2 {
+      // The right iterators are used.
+      // TILE_7:  load %arg0[%[[ii]], %[[j]]]
+      // TILE_74: load %arg0[%[[ii]], %[[jj]]]
+      load %arg0[%i, %j]: memref<?x?xf32>
+    }
+  }
+  return
+}
-- 
2.7.4
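---

For illustration, here is a minimal sketch of what `extractFixedOuterLoops` produces for a single loop of statically unknown range, reconstructed from the logic in the patch and its test. It is not actual pass output: the function name `@example`, the placeholder `"use"` op, and all SSA value names are hypothetical, and the order of constant creation may differ.

// Input: a loop whose trip count is unknown at compile time.
func @example(%lb: index, %ub: index, %step: index) {
  for %i = %lb to %ub step %step {
    "use"(%i) : (index) -> ()
  }
  return
}

// After -extract-fixed-outer-loops -outer-loop-sizes=7, the IR has roughly
// this shape:
func @example(%lb: index, %ub: index, %step: index) {
  %c1 = constant 1 : index
  %c6 = constant 6 : index
  %c7 = constant 7 : index
  // numIterations = ceildiv(%ub - %lb, %step), via the Value overload of
  // ceilDivPositive.
  %diff = subi %ub, %lb : index
  %stepm1 = subi %step, %c1 : index
  %adj = addi %diff, %stepm1 : index
  %range = divis %adj, %step : index
  // tileSize = ceildiv(numIterations, 7), via the constant overload.
  %sum = addi %range, %c6 : index
  %size = divis %sum, %c7 : index
  // Tile loop: original bounds, step scaled by the tile size.
  %tstep = muli %step, %size : index
  for %i = %lb to %ub step %tstep {
    // Point loop: original step, upper bound min(%i + %tstep, %ub).
    %next = addi %i, %tstep : index
    %cmp = cmpi "slt", %ub, %next : index
    %bound = select %cmp, %ub, %next : index
    for %ii = %i to %bound step %step {
      "use"(%ii) : (index) -> ()
    }
  }
  return
}

With tileSize = ceildiv(range, 7), the tile loop's trip count is bounded by 7 at runtime (and equals 7 whenever the range is a multiple of the tile size), independently of the values of %lb, %ub and %step; this bound is what makes the result mappable to a fixed GPU grid.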