From 340314c4dcc801d8f493c45cafd79c79c6e8e58e Mon Sep 17 00:00:00 2001
From: Benoit Jacob <benoitjacob@google.com>
Date: Mon, 13 Sep 2021 12:08:54 -0700
Subject: [PATCH] Reorder mmt4d shapes:

* Revert https://reviews.llvm.org/D107307 so that both LHS and RHS have
  the same layout with K0 as the innermost dimension.

* Continuing from https://reviews.llvm.org/D107003, also move 'K'
  to the outer side, so that now the inter-tile dimensions are all outer,
  and the intra-tile dimensions are all inner.

Reviewed By: asaadaldien

Differential Revision: https://reviews.llvm.org/D109692
---
 .../mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml   | 14 +++++++-------
 .../mlir/dialects/linalg/opdsl/ops/core_named_ops.py       |  6 +++---
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
index ec71aa5..70c4a3c 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml
@@ -181,7 +181,7 @@ structured_op: !LinalgStructuredOpConfig
     name: rhs
     usage: InputOperand
     type_var: RhsType
-    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s4, s1, s3, s5)>
+    shape_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s4, s1, s5, s3)>
   - !LinalgOperandDefConfig
     name: accum
     usage: OutputOperand
@@ -189,19 +189,19 @@ structured_op: !LinalgStructuredOpConfig
     shape_map: affine_map<()[s0, s1, s2, s3, s4, s5] -> (s0, s4, s2, s5)>
   indexing_maps: !LinalgIndexingMapsConfig
     static_indexing_maps:
-    - affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5] -> (d0, d4, d2,
+    - affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5] -> (d0, d2, d3,
       d5)>
-    - affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5] -> (d1, d4, d5,
-      d3)>
-    - affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5] -> (d0, d1, d2,
-      d3)>
+    - affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5] -> (d1, d2, d4,
+      d5)>
+    - affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5] -> (d0, d1, d3,
+      d4)>
   iterator_types:
   - parallel
   - parallel
+  - reduction
   - parallel
   - parallel
   - reduction
-  - reduction
   assignments:
   - !ScalarAssign
     arg: accum
diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
index 38db294..fc37a2e 100644
--- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
+++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
@@ -39,7 +39,7 @@ def quantized_matmul(
 
 @linalg_structured_op
 def mmt4d(lhs=TensorDef(TV.LhsType, S.M, S.K, S.M0, S.K0),
-          rhs=TensorDef(TV.RhsType, S.N, S.K, S.K0, S.N0),
+          rhs=TensorDef(TV.RhsType, S.N, S.K, S.N0, S.K0),
           accum=TensorDef(TV.AccumType, S.M, S.N, S.M0, S.N0,
                                   output=True)):
   """Performs a matrix-matrix-transpose multiplication of two 4D inputs.
@@ -52,9 +52,9 @@ def mmt4d(lhs=TensorDef(TV.LhsType, S.M, S.K, S.M0, S.K0),
       '0' suffixes below, for instance the LHS matrix shape (M, K, M0, K0) reads
       as: MxK tiles, each of shape M0xK0.
   """
-  domain(D.m, D.n, D.m0, D.n0, D.k, D.k0)
+  domain(D.m, D.n, D.k, D.m0, D.n0, D.k0)
   implements(ContractionOpInterface)
-  accum[D.m, D.n, D.m0, D.n0] += cast(TV.AccumType, lhs[D.m, D.k, D.m0, D.k0]) * cast(TV.AccumType, rhs[D.n, D.k, D.k0, D.n0])
+  accum[D.m, D.n, D.m0, D.n0] += cast(TV.AccumType, lhs[D.m, D.k, D.m0, D.k0]) * cast(TV.AccumType, rhs[D.n, D.k, D.n0, D.k0])
 
 @linalg_structured_op
 def batch_matmul(
-- 
2.7.4