From 624fccba87cc7cc5af523f7efb10a0d82118ecd5 Mon Sep 17 00:00:00 2001
From: Alexander Belyaev <pifon@google.com>
Date: Thu, 18 Feb 2021 09:41:40 +0100
Subject: [PATCH] [mlir] Add `linalg.tiled_loop` op.

`subtensor_insert` was used instead of `linalg.subtensor_yield` to make this PR
smaller. Verification will be added in a follow-up PR.

Differential Revision: https://reviews.llvm.org/D96943
---
 mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td |  55 +++++++++
 mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp         | 148 +++++++++++++++++++++++
 mlir/test/Dialect/Linalg/roundtrip.mlir          | 107 ++++++++++++++++
 3 files changed, 310 insertions(+)
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
index dc99e21..dfc4929 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
@@ -15,6 +15,7 @@
 
 include "mlir/Dialect/Linalg/IR/LinalgBase.td"
 include "mlir/Interfaces/ControlFlowInterfaces.td"
+include "mlir/Interfaces/LoopLikeInterface.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/Interfaces/ViewLikeInterface.td"
 
@@ -485,4 +486,58 @@ def Linalg_YieldOp : Linalg_Op<"yield", [NoSideEffect, ReturnLike, Terminator]>,
   }];
 }
 
+def Linalg_TiledLoopOp : Linalg_Op<"tiled_loop", [
+     AttrSizedOperandSegments,
+     DeclareOpInterfaceMethods<LoopLikeOpInterface>,
+     RecursiveSideEffects,
+     SingleBlockImplicitTerminator<"linalg::YieldOp">
+    ]> {
+  let summary = "Linalg tiled loop operation";
+  let description = [{
+    This is a loop-like operation with additional properties. The arguments
+    also include the input and the output tensors and the attributes to specify
+    the iterator types. The body region of the loop contains `subtensor`
+    operations applied to every tensor argument of TiledLoopOp.
+
+    The body region must contain exactly one block that terminates with
+    `linalg.yield` with the operands resulting from `subtensor_insert`
+    operations.
+
+    Parsing TiledLoopOp will set all elements of the `iterator_types` attribute
+    to "parallel" type, when it is absent from the custom format.
+
+    Example:
+
+    ```mlir
+      linalg.tiled_loop (%i) = (%c0) to (%c24) step (%c4)
+        ins(%lhs, %rhs : tensor<24x64xi8>, tensor<24x64xi8>)
+        outs(%out : tensor<24x64xi8>)
+        iterators("parallel") {
+      %lhs_sub = subtensor %lhs[%i, 0] [%c4, %c64] [1, 1]
+          : tensor<24x64xi8> to tensor<?x?xi8>
+      %rhs_sub = subtensor %rhs[%i, 0] [%c4, %c64] [1, 1]
+          : tensor<24x64xi8> to tensor<?x?xi8>
+      %out_sub = subtensor %out[%i, 0] [%c4, %c64] [1, 1]
+          : tensor<24x64xi8> to tensor<?x?xi8>
+
+      %result_sub = linalg.generic ...
+
+      %result = subtensor_insert %result_sub into %out[%i, 0][%c4, %c64][1, 1]
+        : tensor<?x?xi8> into tensor<24x64xi8>
+      linalg.yield %result : tensor<24x64xi8>
+    }
+    ```
+  }];
+  // Bounds and steps are index values; all other operands are ranked tensors.
+  let arguments = (ins Variadic<Index>:$lowerBound,
+                       Variadic<Index>:$upperBound,
+                       Variadic<Index>:$step,
+                       Variadic<AnyRankedTensor>:$inputs,
+                       Variadic<AnyRankedTensor>:$outputs,
+                       ArrayAttr:$iterator_types);
+  let results = (outs Variadic<AnyRankedTensor>:$results);
+  let regions = (region SizedRegion<1>:$region);
+}
+
+
 #endif // LINALG_OPS
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index 7c34867..b176f58 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -1704,9 +1704,157 @@ static LogicalResult verify(linalg::YieldOp op) {
     return success();
   }
 
+  if (auto tiledLoopOp = dyn_cast<linalg::TiledLoopOp>(parentOp)) {
+    return success();
+  }
   return op.emitOpError("expected parent op with LinalgOp interface");
 }
 
+//===----------------------------------------------------------------------===//
+// TiledLoopOp
+//===----------------------------------------------------------------------===//
+
+// Prints the form read by parseTiledLoopOp; all-parallel iterators are elided.
+static void print(OpAsmPrinter &p, TiledLoopOp op) {
+  p << op.getOperationName() << " (" << op.getBody()->getArguments() << ") = ("
+    << op.lowerBound() << ") to (" << op.upperBound() << ") step (" << op.step()
+    << ")";
+  if (!op.inputs().empty())
+    p << " ins (" << op.inputs() << " : " << op.inputs().getTypes() << ")";
+  if (!op.outputs().empty())
+    p << " outs (" << op.outputs() << " : " << op.outputs().getTypes() << ")";
+  if (llvm::any_of(op.iterator_types(), [](Attribute attr) {
+        return attr.cast<StringAttr>().getValue() !=
+               getParallelIteratorTypeName();
+      })) {
+    p << " iterators(";
+    llvm::interleaveComma(op.iterator_types().getValue(), p);
+    p << ")";
+  }
+  p.printRegion(op.region(), /*printEntryBlockArgs=*/false);
+  p.printOptionalAttrDict(
+      op.getAttrs(), /*elidedAttrs=*/{TiledLoopOp::getOperandSegmentSizeAttr(),
+                                      getIteratorTypesAttrName()});
+}
+
+// Parses the custom assembly of `linalg.tiled_loop`:
+//   (ivs) = (lbs) to (ubs) step (steps) [ins(operands : types)]
+//   [outs(operands : types)] [iterators(strings)] region [attr-dict]
+// A missing `iterators` clause makes every loop "parallel"; the `outs` types
+// double as the result types.
+static ParseResult parseTiledLoopOp(OpAsmParser &parser,
+                                    OperationState &result) {
+  auto &builder = parser.getBuilder();
+  // Parse an opening `(` followed by induction variables followed by `)`
+  SmallVector<OpAsmParser::OperandType, 4> ivs;
+  if (parser.parseRegionArgumentList(ivs, /*requiredOperandCount=*/-1,
+                                     OpAsmParser::Delimiter::Paren))
+    return failure();
+
+  // Parse loop bounds.
+  SmallVector<OpAsmParser::OperandType, 4> lower;
+  if (parser.parseEqual() ||
+      parser.parseOperandList(lower, ivs.size(),
+                              OpAsmParser::Delimiter::Paren) ||
+      parser.resolveOperands(lower, builder.getIndexType(), result.operands))
+    return failure();
+
+  SmallVector<OpAsmParser::OperandType, 4> upper;
+  if (parser.parseKeyword("to") ||
+      parser.parseOperandList(upper, ivs.size(),
+                              OpAsmParser::Delimiter::Paren) ||
+      parser.resolveOperands(upper, builder.getIndexType(), result.operands))
+    return failure();
+
+  // Parse step values.
+  SmallVector<OpAsmParser::OperandType, 4> steps;
+  if (parser.parseKeyword("step") ||
+      parser.parseOperandList(steps, ivs.size(),
+                              OpAsmParser::Delimiter::Paren) ||
+      parser.resolveOperands(steps, builder.getIndexType(), result.operands))
+    return failure();
+
+  // Parse input tensors.
+  SmallVector<OpAsmParser::OperandType, 4> inputs;
+  if (succeeded(parser.parseOptionalKeyword("ins"))) {
+    SmallVector<Type, 4> inputTypes;
+    llvm::SMLoc inputsOperandsLoc = parser.getCurrentLocation();
+    if (parser.parseLParen() || parser.parseOperandList(inputs) ||
+        parser.parseColonTypeList(inputTypes) || parser.parseRParen() ||
+        parser.resolveOperands(inputs, inputTypes, inputsOperandsLoc,
+                               result.operands))
+      return failure();
+  }
+
+  // Parse output tensors.
+  SmallVector<OpAsmParser::OperandType, 4> outputs;
+  if (succeeded(parser.parseOptionalKeyword("outs"))) {
+    SmallVector<Type, 4> outputTypes;
+    llvm::SMLoc outputsOperandsLoc = parser.getCurrentLocation();
+    if (parser.parseLParen() || parser.parseOperandList(outputs) ||
+        parser.parseColonTypeList(outputTypes) || parser.parseRParen() ||
+        parser.resolveOperands(outputs, outputTypes, outputsOperandsLoc,
+                               result.operands))
+      return failure();
+    result.addTypes(outputTypes);
+  }
+
+  // Parse attributes.
+  SmallVector<Attribute, 4> iterTypes;
+  if (succeeded(parser.parseOptionalKeyword("iterators"))) {
+    StringAttr iterType;
+    // First entry is mandatory; one more per remaining induction variable.
+    if (parser.parseLParen() || parser.parseAttribute(iterType))
+      return failure();
+    iterTypes.push_back(iterType);
+    for (int i = 1, e = ivs.size(); i < e; ++i) {
+      if (parser.parseComma() || parser.parseAttribute(iterType))
+        return failure();
+      iterTypes.push_back(iterType);
+    }
+    if (parser.parseRParen())
+      return failure();
+  } else {
+    auto parallelIter = builder.getStringAttr(getParallelIteratorTypeName());
+    iterTypes = SmallVector<Attribute, 4>(ivs.size(), parallelIter);
+  }
+  result.addAttribute(getIteratorTypesAttrName(),
+                      builder.getArrayAttr(iterTypes));
+  result.addAttribute(
+      TiledLoopOp::getOperandSegmentSizeAttr(),
+      builder.getI32VectorAttr({static_cast<int32_t>(lower.size()),
+                                static_cast<int32_t>(upper.size()),
+                                static_cast<int32_t>(steps.size()),
+                                static_cast<int32_t>(inputs.size()),
+                                static_cast<int32_t>(outputs.size())}));
+
+  // Parse the body.
+  Region *body = result.addRegion();
+  SmallVector<Type, 4> types(ivs.size(), builder.getIndexType());
+  if (parser.parseRegion(*body, ivs, types))
+    return failure();
+
+  // Parse optional attributes; a malformed dictionary must fail the parse.
+  if (parser.parseOptionalAttrDict(result.attributes))
+    return failure();
+
+  return success();
+}
+// Returns the op's region, which serves as the loop body.
+Region &TiledLoopOp::getLoopBody() { return region(); }
+// Moves every op in `ops` to the position right before this loop op.
+LogicalResult TiledLoopOp::moveOutOfLoop(ArrayRef<Operation *> ops) {
+  for (Operation *hoisted : ops)
+    hoisted->moveBefore(getOperation());
+  return success();
+}
+// True unless `value` is defined in the loop region or a region nested in it.
+bool TiledLoopOp::isDefinedOutsideOfLoop(Value value) {
+  return !getLoopBody().isAncestor(value.getParentRegion());
+}
+// TODO: Accepts every op for now; verification is deferred to a follow-up.
+static LogicalResult verify(TiledLoopOp op) { return success(); }
+
 /////// Operations corresponding to library calls defined with Tablegen ////////
 
 template <typename LinalgPoolingOp>
diff --git a/mlir/test/Dialect/Linalg/roundtrip.mlir b/mlir/test/Dialect/Linalg/roundtrip.mlir
index 32ea93b..cffafa5 100644
--- a/mlir/test/Dialect/Linalg/roundtrip.mlir
+++ b/mlir/test/Dialect/Linalg/roundtrip.mlir
@@ -794,3 +794,110 @@ func @fill_tensor(%arg0 : index, %arg1 : index, %arg2 : f32) -> tensor<?x?xf32>
   return %1 : tensor<?x?xf32>
 }
 // CHECK: %{{.+}} = linalg.fill(%{{.+}}, %{{.+}}) : tensor<?x?xf32>, f32 -> tensor<?x?xf32>
+
+// -----
+
+#accesses = [
+  affine_map<(i, j) -> (i, j)>,
+  affine_map<(i, j) -> (i, j)>,
+  affine_map<(i, j) -> (i, j)>
+]
+
+#trait = {
+  indexing_maps = #accesses,
+  iterator_types = ["parallel", "parallel"]
+}
+// Single loop without an `iterators` clause: none is expected in the output.
+func @tiled_loop(%lhs: tensor<24x64xi8>, %rhs: tensor<24x64xi8>,
+                 %out: tensor<24x64xi8>) -> tensor<24x64xi8> {
+ %c0 = constant 0 : index
+ %c1 = constant 1 : index
+ %c4 = constant 4 : index
+ %c24 = constant 24 : index
+ %c64 = constant 64 : index
+ %prod = linalg.tiled_loop (%i) = (%c0) to (%c24) step (%c4)
+      ins(%lhs, %rhs : tensor<24x64xi8>, tensor<24x64xi8>)
+      outs(%out : tensor<24x64xi8>) {
+    %lhs_sub = subtensor %lhs[%i, 0] [%c4, %c64] [1, 1]
+        : tensor<24x64xi8> to tensor<?x?xi8>
+    %rhs_sub = subtensor %rhs[%i, 0] [%c4, %c64] [1, 1]
+        : tensor<24x64xi8> to tensor<?x?xi8>
+    %out_sub = subtensor %out[%i, 0] [%c4, %c64] [1, 1]
+        : tensor<24x64xi8> to tensor<?x?xi8>
+
+    %sum = linalg.generic #trait
+        ins(%lhs_sub, %rhs_sub : tensor<?x?xi8>, tensor<?x?xi8>)
+        outs(%out_sub : tensor<?x?xi8>) {
+      ^bb(%l: i8, %r: i8, %o: i8) :
+        %s = addi %l, %r : i8
+        linalg.yield %s : i8
+      } -> tensor<?x?xi8>
+
+    %sum_sub = subtensor_insert %sum into %out[%i, 0][%c4, %c64][1, 1]
+      : tensor<?x?xi8> into tensor<24x64xi8>
+    linalg.yield %sum_sub : tensor<24x64xi8>
+  }
+  return %prod : tensor<24x64xi8>
+}
+// CHECK-LABEL: func @tiled_loop
+// CHECK-NOT: iterators(
+
+// -----
+
+#id_3d = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#id_2d = affine_map<(d0, d1, d2) -> (d0, d2)>
+#id_1d = affine_map<(d0, d1, d2) -> (d1)>
+
+#trait = {
+  indexing_maps = [
+    #id_3d,
+    #id_2d,
+    #id_1d,
+    #id_1d
+  ],
+  iterator_types = ["reduction", "parallel", "reduction"]
+}
+// Mixed iterator types: the `iterators` clause must appear in the output.
+func @tiled_loop_reduction(%input_3d: tensor<16x24x32xf32>,
+                           %input_2d: tensor<16x32xf32>,
+                           %input_1d: tensor<24xf32>,
+                           %output: tensor<24xf32>) -> tensor<24xf32> {
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %c2 = constant 2 : index
+  %c4 = constant 4 : index
+  %c8 = constant 8 : index
+  %X = dim %input_3d, %c0 : tensor<16x24x32xf32>
+  %Y = dim %input_3d, %c1 : tensor<16x24x32xf32>
+  %Z = dim %input_3d, %c2 : tensor<16x24x32xf32>
+  %result = linalg.tiled_loop (%i, %j, %k)
+      = (%c0, %c0, %c0) to (%X, %Y, %Z) step (%c2, %c4, %c8)
+      ins(%input_3d, %input_2d, %input_1d
+        : tensor<16x24x32xf32>, tensor<16x32xf32>, tensor<24xf32>)
+      outs(%output : tensor<24xf32>)
+      iterators("reduction", "parallel", "reduction") {
+    %sub_3d = subtensor %input_3d[%i, %j, %k][2, 4, 8][1, 1, 1]
+      : tensor<16x24x32xf32> to tensor<2x4x8xf32>
+    %sub_2d = subtensor %input_2d[%i, %k][2, 8][1, 1]
+      : tensor<16x32xf32> to tensor<2x8xf32>
+    %sub_1d = subtensor %input_1d[%j] [4] [1]
+      : tensor<24xf32> to tensor<4xf32>
+    %sub_out = subtensor %output[%j] [4] [1]
+      : tensor<24xf32> to tensor<4xf32>
+    %acc = linalg.generic #trait
+      ins(%sub_3d, %sub_2d, %sub_1d
+        : tensor<2x4x8xf32>, tensor<2x8xf32>, tensor<4xf32>)
+      outs(%sub_out : tensor<4xf32>)  {
+    ^bb0(%i3d: f32, %i2d: f32, %i1d: f32, %o: f32):
+      %0 = addf %i3d, %i2d : f32
+      %1 = addf %0, %i1d : f32
+      linalg.yield %1 : f32
+    } -> tensor<4xf32>
+    %sum_sub = subtensor_insert %acc into %output[%j][%c4][1]
+      : tensor<4xf32> into tensor<24xf32>
+    linalg.yield %sum_sub : tensor<24xf32>
+  }
+  return %result : tensor<24xf32>
+}
+// CHECK-LABEL: func @tiled_loop_reduction
+// CHECK: iterators(
-- 
2.7.4