From dfd98764f7a44987a06659e327cd0b66ce6e2df3 Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache <ntv@google.com>
Date: Tue, 9 Apr 2019 01:24:35 -0700
Subject: [PATCH]     Start a Linalg doc

--

PiperOrigin-RevId: 242622278
---
 mlir/g3doc/Tutorials/Linalg/Ch-1.md                | 255 +++++++++++++++++++++
 mlir/g3doc/Tutorials/Linalg/Ch-2.md                |  98 ++++++++
 mlir/g3doc/Tutorials/Linalg/DeclarativeBuilders.md | 128 +++++++++++
 3 files changed, 481 insertions(+)
 create mode 100644 mlir/g3doc/Tutorials/Linalg/Ch-1.md
 create mode 100644 mlir/g3doc/Tutorials/Linalg/Ch-2.md
 create mode 100644 mlir/g3doc/Tutorials/Linalg/DeclarativeBuilders.md

diff --git a/mlir/g3doc/Tutorials/Linalg/Ch-1.md b/mlir/g3doc/Tutorials/Linalg/Ch-1.md
new file mode 100644
index 0000000..86fbc43
--- /dev/null
+++ b/mlir/g3doc/Tutorials/Linalg/Ch-1.md
@@ -0,0 +1,255 @@
+# Linalg Dialect
+
+This chapter describes the design and implementation of a simple linear algebra
+dialect in MLIR. The objective of the `linalg` dialect is to demonstrate that
+the MLIR infrastructure is a great fit for implementing high-level operations
+and lower them gradually to LLVM by reusing existing components and lowering
+paths. In particular, `linalg` is built upon the type system of the
+[`affine`](../../Dialects/Affine.md) dialect, which allows partial lowering to
+be implemented with relative ease.
+
+The `linalg` dialect is introduced gradually following this outline:
+
+1.  Type system and type-building operations.
+2.  Compute operations.
+3.  Lowerings between the `linalg` operations into `linalg` + `affine`
+    operations.
+4.  Tiling transformations.
+5.  A simple tiling and fusion transformation.
+
+The Toy language tutorial already introduced core MLIR concepts and best
+practices, the `linalg` dialect operates mostly at the level of the C++ API and
+in particular makes use of [declarative builders](DeclarativeBuilders.md), for
+terser IR emitting expressions. Without loss of generality, anything in this
+section can also be implemented with `mlir::Builder` and enough
+`getInsertionPoint` and `setInsertionPoint` manipulations.
+
+The implementation follows a few conventions to decouple, at each step, the
+newly introduced concepts and code from ones introduced previously without
+duplicating the whole code base in each directory. The code for concepts
+introduced at a particular step `k` live in the `Linalgk/include/linalgk` and
+`Linalgk/lib` directories and is linked into the `Linalgk` library.
+
+Lastly, note that simplifying assumptions are made to cut down on boilerplate
+and help focus on the core concepts. In particular, parsing the linalg dialect
+is currently not supported as it is used as an intermediary dialect. This does
+not impact the ability to lower all the way to LLVM with proper verified IR at
+each step of the lowering, or to execute the compiled binary.
+
+# Linalg Part 1: Type system
+
+We first describe the `linalg` type system.
+
+## RangeType and RangeOp
+
+A
+[RangeType](https://github.com/tensorflow/mlir/blob/master/examples/Linalg/Linalg1/include/linalg1/RangeType.h)
+is a simple triple of `index` values. It represents a minimal range abstraction
+`(min, max, step)`. `RangeType` is a fully defined type and is constructed
+without any additional type argument. Its implementation illustrates the minimal
+amount of information required to implement a new custom MLIR type.
+
+```
+class RangeType : public mlir::Type::TypeBase<RangeType, mlir::Type> {
+public:
+  // Used to implement llvm-style cast.
+  using Base::Base;
+  /// Construction hook.
+  static RangeType get(mlir::MLIRContext *context) {
+    /// Custom, uniqu'ed construction in the mlir::MLIRContext.
+    return Base::get(context, LinalgTypes::Range);
+  }
+  /// Used to implement llvm-style cast.
+  static bool kindof(unsigned kind) { return kind == LinalgTypes::Range; }
+};
+```
+
+Unlike more complex types, RangeType does not require a hashing key for
+unique'ing in the `MLIRContext`. Note that all MLIR types derive from
+`mlir::Type::TypeBase` and expose `using Base::Base` to enable generic hooks to
+work properly (in this instance for llvm-style casts. RangeType does not even
+require an implementation file as the above represents the whole code for the
+type.
+
+The `linalg` dialect type `RangeType` pretty-prints simply as `!linalg.range`.
+
+A `linalg::RangeOp`, defined
+[here](https://github.com/tensorflow/mlir/blob/master/examples/Linalg/Linalg1/include/linalg1/RangeOp.h),
+is the operation that produces ssa-values of `RangeType`. It pretty-prints as
+
+```
+  %0 = linalg.range %min, %max, %range : !linalg.range
+```
+
+The implementation of the `RangeOp::build` method and `RangeOp::verify`
+[methods](https://github.com/tensorflow/mlir/blob/master/examples/Linalg/Linalg1/lib/RangeOp.cpp)
+are straightforward.
+
+A RangeType is used throughout to step over iteration domains (i.e. loop
+iterations via loop bounds and steps) as well as over the view data abstraction.
+A `LoopNestRangeBuilder` helper class is
+[introduced](https://github.com/tensorflow/mlir/blob/master/examples/Linalg/Linalg1/include/linalg1/Common.h)
+to allow emission of loop nests from an `llvm::ArrayRef<mlir::Value*>` where
+each `mlir::Value` is a `linalg.range`.
+
+### Simplifying assumption
+
+The `linalg.range` type is generally unrestricted beyond havind elements of
+`index` type. however it is used to build loop nests using the `affine.for`
+[operation](../../Dialects/Affine.md) whose restrictions it inherits, at the
+point where `affine.for` operations are materialized. This is a tradeoff to
+reuse existing MLIR operations that are already known to lower to LLVM. As a
+consequence, the `step` in a `linalg.range` must be a static constant and cannot
+be symbolic.
+
+## ViewType and ViewOp
+
+A
+[ViewType](https://github.com/tensorflow/mlir/blob/master/examples/Linalg/Linalg1/include/linalg1/ViewType.h)
+represents a multi-dimensional range abstraction to iterate over an underlying
+storage type. It is backed by a data type, in our case objects of
+[MemRefType](https://github.com/tensorflow/mlir/blob/master/include/mlir/IR/StandardTypes.h).
+A ViewType is a parameterized type which has a base element type and a rank. It
+is thus slightly more complex than RangeType and requires unique'ing in the
+enclosing MLIRContext.
+
+This is materialized by the existence of a storage type and a `hashKey` in the
+implementation
+[file](https://github.com/tensorflow/mlir/blob/master/examples/Linalg/Linalg1/lib/ViewType.cpp).
+
+```
+struct ViewTypeStorage : public mlir::TypeStorage {
+  /// Underlying Key type to transport the payload needed to construct a custom
+  /// type in a generic way.
+  struct Key {
+    Key(Type elementType, unsigned rank)
+        : elementType(elementType), rank(rank) {}
+    Type elementType;
+    unsigned rank;
+  };
+  ...
+};
+```
+
+The `ViewTypeStorage` is not visible outside of the `ViewType` implementation
+and is referred to from `ViewType` as such: `class ViewType : public
+mlir::Type::TypeBase<ViewType, mlir::Type, ViewTypeStorage> { ... }`
+
+A two dimensional ViewType over a f32 storage pretty-prints as `view<?x?xf32>`.
+
+A `linalg::ViewOp`, defined
+[here](https://github.com/tensorflow/mlir/blob/master/examples/Linalg/Linalg1/lib/ViewOp.cpp),
+is the operation that produces ssa-values of `ViewType` from an ssa-value of
+type `MemRefType`. A ViewOp has operands called "indexings" which can be either
+of `index` or `!linalg.range` type. The rationale is that `index` reduces the
+rank of a ViewType by 1 while a `!linalg.range` keeps the rank unchanged. This
+behavior is a convention that we have found useful during the implementation in
+order to fold chains of slice operations (introduced in the following paragraph)
+and capture enough information in the ViewOp so it can be lowered to LLVM.
+
+The entry point to the builder is the method: `static void
+ViewOp::build(mlir::Builder *b, mlir::OperationState *result, mlir::Value
+*memRef, llvm::ArrayRef<mlir::Value *> indexings = {});`
+
+A `ViewOp` pretty-prints as: `%1 = linalg.view %0[%m, %n, %k] :
+!linalg.view<?x?xf32>`
+
+This signifies that `%0` is a three dimensional `MemRef` of `f32` elemental type
+and that the `%1` view uses an `index` into one of the dimensions and two
+`!linalg.range` for the two other dimensions.
+
+The implementation of the `ViewOp::build` and `ViewOp::verify`
+[methods](https://github.com/tensorflow/mlir/blob/master/examples/Linalg/Linalg1/lib/ViewOp.cpp)
+are simple.
+
+### Simplifying assumption
+
+We choose to reuse the existing MLIR
+`MemRef`[type](https://github.com/tensorflow/mlir/blob/master/include/mlir/IR/StandardTypes.h)
+as the underlying data structure. This avoids the need to redefine a new
+abstraction and simplifies lowering all the way to LLVM.
+
+## SliceOp
+
+A slice is a subview that is fully contained within its parent view and is
+constructed using a `SliceOp`. A SliceOp takes an ssa-value of type
+`linalg.view` and an "indexing" to produce a new `linalg.view` of rank:
+
+1.  Equal to the rank of the original view, if the indexing is a
+    `!linalg.range`.
+2.  Equal to the rank of the original view minus one, if the indexing is an
+    `index`.
+
+A slice op has an integer attribute which specifies the dimension of the parent
+view it slices and pretty-prints as:
+
+```
+%2 = linalg.slice %1[*, *, %0, *] : !linalg.view<?x?x?xf32>
+```
+
+In this particular case, %2 slices dimension `2` of the four dimensional view
+%1. The returned `!linalg.view<?x?x?xf32>` indicates that the indexing is
+rank-reducing and that %0 is an `index`.
+
+The implementation of the `SliceOp::build` and `SliceOp::verify`
+[methods](https://github.com/tensorflow/mlir/blob/master/examples/Linalg/Linalg1/lib/SliceOp.cpp)
+are simple.
+
+### Simplifying assumption
+
+In this tutorial we do not enforce the strict subview property or perform bounds
+check analysis and instead assume that the code is correct by construction.
+
+## Notable remarks
+
+The declaration for the classes implementing the operations we described have
+common traits that enable certain API shortcuts and other behaviors. For
+instance, the `mlir::OpTrait::OneResult` makes the `getResult()` method
+available to the class.
+
+```
+
+class RangeOp : public mlir::Op<RangeOp, mlir::OpTrait::NOperands<3>::Impl,
+                                mlir::OpTrait::OneResult,
+                                mlir::OpTrait::HasNoSideEffect> { ... };
+
+class ViewOp : public mlir::Op<ViewOp, mlir::OpTrait::VariadicOperands,
+                               mlir::OpTrait::OneResult,
+                               mlir::OpTrait::HasNoSideEffect> { ... } ;
+
+class SliceOp : public mlir::Op<SliceOp, mlir::OpTrait::NOperands<2>::Impl,
+                                mlir::OpTrait::OneResult,
+                                mlir::OpTrait::HasNoSideEffect> { ... };
+```
+
+One particular trait of interest is `mlir::OpTrait::HasNoSideEffect` which
+enables constant folding and dead code elimination in the `canonicalizerPass`.
+
+## Dialect Registration
+
+Similarly to Toy, the dialect must be registered so that the pretty-printer and
+verifier can be enabled. Without registration, only the custom op form can be
+printed. Beware of ops printed in custom op form, when a short-hand form exists,
+because there is a high chance the IR verification is not enabled.
+
+To register the Linalg dialect, call
+`mlir::registerDialect<linalg::LinalgDialect>();`.
+
+### Note on code organization
+
+Registration occurs by constructing a new `LinalgDialect` which registers the
+proper types and ops at construction time, with sanity checks guarding against
+multiple registrations of the same symbols. At that point, the constructor needs
+to be statically aware of all the types and ops. Since our code structure
+chooses to isolate independent portions of the tutorial, and certain ops are
+introduced in later parts, we explicitly separate `DialectConstruction.cpp` in
+its separate library. Linking with the proper library enables the types that
+have been declared so far.
+
+## Putting it all together
+
+The
+[example](https://github.com/tensorflow/mlir/blob/master/examples/Linalg/Linalg1/Example.cpp)
+demonstrates how to construct some simple IR snippets that pass through the
+verifier checks. We introduce a custom op called `some_consumer` to ensure that
+dead-code elimination does not optimize these simple examples out of existence.
diff --git a/mlir/g3doc/Tutorials/Linalg/Ch-2.md b/mlir/g3doc/Tutorials/Linalg/Ch-2.md
new file mode 100644
index 0000000..31af83e
--- /dev/null
+++ b/mlir/g3doc/Tutorials/Linalg/Ch-2.md
@@ -0,0 +1,98 @@
+# Linalg Part 2: Compute Operations
+
+We now describe the main compute operations `linalg.dot`, `linalg.matvec` and
+`linalg.matmul`. These operations are a subset of a more general tensor
+contraction
+[class](https://github.com/tensorflow/mlir/blob/master/examples/Linalg/Linalg2/include/linalg2/TensorOps.h)
+of operations. In this tutorial, we define a tensor contraction as a generic
+operation which:
+
+1.  Reads a `getNumInputs()` number of input ssa-values of ViewType.
+2.  Writes into a `getNumOutputs()` number of input ssa-values of ViewType.
+3.  Can be written in scalar loop form as a perfect loop nest with
+    `getNumParallelDims()` outermost loops with parallel semantics and
+    `getNumReductionDims()` innermost dimensions with reduction semantics.
+4.  Has a scalar form that is specific to each particular specialization.
+
+## Operation Definition
+
+In this section we do not discuss the specific properties of tensor contractions
+but only define the `linalg.dot`, `linalg.matvec` and `linalg.matmul` operations
+as opaque operations with side-effects (reads and writes into input and output
+views).
+
+These operations take input and output views of the proper rank as operands. For
+the purpose of illustration, assume all the elemental types are fixed to `f32`.
+The invariants checked by the op-specific
+[verify](https://github.com/tensorflow/mlir/blob/master/examples/Linalg/Linalg2/lib/TensorOps.cpp)
+functions are:
+
+1.  `linalg.dot` reads two one-dimensional `view<?xf32>` and writes a
+    zero-dimensional `view<f32>` (i.e. a scalar).
+2.  `linalg.matvec` reads a two-dimensional `view<?x?xf32>` matrix and a one
+    dimensional `view<?xf32>` vector and writes a one-dimensional `view<?xf32>`.
+3.  `linalg.matmul` reads two two-dimensional `view<?x?xf32>` matrices and
+    writes a two-dimensional `view<?x?xf32>` matrix.
+
+Other operations on higher-order tensors can be defined and would behave
+similarly with respect to IR verification and interactions with ViewType
+operands. The generic form of verification and pretty-printing is defined on the
+`TensorContractionBase`
+[class](https://github.com/tensorflow/mlir/blob/master/examples/Linalg/Linalg2/include/linalg2/TensorOps.h).
+
+Note that in order to give TensorContractionBase access to the mlir::Op in a
+generic fashion, we use a CRTP pattern where:
+
+```
+template <class ConcreteOp> class TensorContractionBase { ... };
+
+class DotOp : public TensorContractionBase<DotOp>,
+              public mlir::Op<DotOp, mlir::OpTrait::VariadicOperands,
+                              mlir::OpTrait::ZeroResult> { ... }
+```
+
+In turn, this allows the generic behavior of TensorContractionBase to be
+implemented once and reused across ops. The generic verify method is:
+
+```
+template <class ConcreteOp>
+mlir::LogicalResult linalg::TensorContractionBase<ConcreteOp>::verify() {
+  auto *concreteOp = static_cast<ConcreteOp *>(this)->getOperation();
+  if (getNumInputs() <= 0)
+    concreteOp->emitOpError("expected at least one input");
+  ...
+}
+```
+
+Each specialized operation then calls into the generic verification method
+before applying its own verification steps.
+
+```
+LogicalResult linalg::MatmulOp::verify() {
+  if (failed(TensorContractionBaseType::verify()))
+    return failure();
+  auto *A = getOperand(0), *B = getOperand(1), *C = getOperand(2);
+  unsigned index = 0;
+  for (auto *v : {A, B, C}) {
+    if (getViewRank(v) != 2)
+      return emitOpError("operand " + Twine(index++) + " must be of rank 2");
+  }
+  return success();
+}
+```
+
+Note that in a more future-proof design, it is considered a best practice for
+operations which share similarity in their behavior to be defined with Tablegen.
+
+All TensorContractionBase ops pretty-print similarly. In the case of
+`linalg.matmul` the pretty-printed form is: `linalg.matmul(%A, %B, %C) :
+view<?x?xf32>`
+
+## Putting it all together
+
+The
+[example](https://github.com/tensorflow/mlir/blob/master/examples/Linalg/Linalg2/Example.cpp)
+demonstrates how to construct some simple IR snippets that pass through the
+verifier checks. The example demonstrate how to allocate three memref buffers
+from `index` function arguments and use those buffers as backing data structures
+for views that get passed to
diff --git a/mlir/g3doc/Tutorials/Linalg/DeclarativeBuilders.md b/mlir/g3doc/Tutorials/Linalg/DeclarativeBuilders.md
new file mode 100644
index 0000000..f3ad518
--- /dev/null
+++ b/mlir/g3doc/Tutorials/Linalg/DeclarativeBuilders.md
@@ -0,0 +1,128 @@
+# Background: declarative builders API
+
+The main purpose of the declarative builders API is to provide an intuitive way
+of constructing MLIR programmatically. In the majority of cases, the IR we wish
+to construct exhibits structured control-flow. Declarative builders provide an
+API to make MLIR construction and manipulation very idiomatic, for the
+structured control-flow case, in C++.
+
+## ScopedContext
+
+`mlir::edsc::ScopedContext` provides an implicit thread-local context,
+supporting a simple declarative API with globally accessible builders. These
+declarative builders are available within the lifetime of a `ScopedContext`.
+
+## ValueHandle and IndexHandle
+
+`mlir::edsc::ValueHandle` and `mlir::edsc::IndexHandle` provide typed
+abstractions around an `mlir::Value*`. These abstractions are "delayed", in the
+sense that they allow separating declaration from definition. They may
+capture IR snippets, as they are built, for programmatic manipulation.
+Intuitive operators are provided to allow concise and idiomatic expressions.
+
+```c++
+ValueHandle zero = constant_index(0);
+IndexHandle i, j, k;
+```
+
+## Intrinsics
+
+`mlir::edsc::ValueBuilder` is a generic wrapper for the `mlir::Builder::create`
+method that operates on `ValueHandle` objects and return a single ValueHandle.
+For instructions that return no values or that return multiple values, the
+`mlir::edsc::InstructionBuilder` can be used. Named intrinsics are provided as
+syntactic sugar to further reduce boilerplate.
+
+```c++
+using load = ValueBuilder<LoadOp>;
+using store = InstructionBuilder<StoreOp>;
+```
+
+## LoopBuilder and LoopNestBuilder
+
+`mlir::edsc::LoopNestBuilder` provides an interface to allow writing concise and
+structured loop nests.
+
+```c++
+  ScopedContext scope(f.get());
+  ValueHandle i(indexType),
+              j(indexType),
+              lb(f->getArgument(0)),
+              ub(f->getArgument(1));
+  ValueHandle f7(constant_float(llvm::APFloat(7.0f), f32Type)),
+              f13(constant_float(llvm::APFloat(13.0f), f32Type)),
+              i7(constant_int(7, 32)),
+              i13(constant_int(13, 32));
+  LoopBuilder(&i, lb, ub, 3)({
+      lb * index_t(3) + ub,
+      lb + index_t(3),
+      LoopBuilder(&j, lb, ub, 2)({
+          ceilDiv(index_t(31) * floorDiv(i + j * index_t(3), index_t(32)),
+                  index_t(32)),
+          ((f7 + f13) / f7) % f13 - f7 * f13,
+          ((i7 + i13) / i7) % i13 - i7 * i13,
+      }),
+  });
+```
+
+## IndexedValue
+
+`mlir::edsc::IndexedValue` provides an index notation around load and store
+operations on abstract data types by overloading the C++ assignment and
+parenthesis operators. The relevant loads and stores are emitted as appropriate.
+
+## Putting it all together
+
+With declarative builders, it becomes fairly concise to build rank and
+type-agnostic custom operations even though MLIR does not yet have generic
+types. Here is what a definition of a general pointwise add looks in
+Tablegen with declarative builders.
+
+```c++
+def AddOp : Op<"x.add">,
+    Arguments<(ins Tensor:$A, Tensor:$B)>,
+    Results<(outs Tensor: $C)> {
+  code referenceImplementation = [{
+    auto ivs = IndexHandle::makeIndexHandles(view_A.rank());
+    auto pivs = IndexHandle::makePIndexHandles(ivs);
+    IndexedValue A(arg_A), B(arg_B), C(arg_C);
+    LoopNestBuilder(pivs, view_A.getLbs(), view_A.getUbs(), view_A.getSteps())({
+      C(ivs) = A(ivs) + B(ivs)
+    });
+  }];
+}
+```
+
+Depending on the function signature on which this emitter is called, the
+generated IR resembles the following, for a 4-D memref of `vector<4xi8>`:
+
+``` {.mlir}
+// CHECK-LABEL: func @t1(%lhs: memref<3x4x5x6xvector<4xi8>>, %rhs: memref<3x4x5x6xvector<4xi8>>, %result: memref<3x4x5x6xvector<4xi8>>) -> () {
+//       CHECK: for {{.*}} = 0 to 3 {
+//       CHECK:   for {{.*}} = 0 to 4 {
+//       CHECK:     for {{.*}} = 0 to 5 {
+//       CHECK:       for {{.*}}= 0 to 6 {
+//       CHECK:         {{.*}} = load %arg1[{{.*}}] : memref<3x4x5x6xvector<4xi8>>
+//       CHECK:         {{.*}} = load %arg0[{{.*}}] : memref<3x4x5x6xvector<4xi8>>
+//       CHECK:         {{.*}} = addi {{.*}} : vector<4xi8>
+//       CHECK:         store {{.*}}, %arg2[{{.*}}] : memref<3x4x5x6xvector<4xi8>>
+```
+
+or the following, for a 0-D `memref<f32>`:
+
+``` {.mlir}
+// CHECK-LABEL: func @t3(%lhs: memref<f32>, %rhs: memref<f32>, %result: memref<f32>) -> () {
+//       CHECK: {{.*}} = load %arg1[] : memref<f32>
+//       CHECK: {{.*}} = load %arg0[] : memref<f32>
+//       CHECK: {{.*}} = addf {{.*}}, {{.*}} : f32
+//       CHECK: store {{.*}}, %arg2[] : memref<f32>
+```
+
+Since the implementation of declarative builders is in C++, it is also available
+to program the IR with an embedded-DSL flavor directly integrated in MLIR. We
+make use of these properties in the tutorial.
+
+Spoiler: MLIR also provides Python bindings for these builders, and a
+full-fledged Python machine learning DSL with automatic differentiation
+targeting MLIR was built as an early research collaboration.
+
-- 
2.7.4