[mlir][VectorToGPU] Support transposed+broadcasted 2D MMA load

author Lei Zhang <antiagainst@google.com>

Thu, 15 Dec 2022 19:34:14 +0000 (19:34 +0000)

committer Lei Zhang <antiagainst@google.com>

Thu, 15 Dec 2022 19:34:32 +0000 (19:34 +0000)
author Lei Zhang <antiagainst@google.com>
Thu, 15 Dec 2022 19:34:14 +0000 (19:34 +0000)
committer Lei Zhang <antiagainst@google.com>
Thu, 15 Dec 2022 19:34:32 +0000 (19:34 +0000)
diff --git a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp

index 836e82e..c0d093b 100644 (file)
--- a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
+++ b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
@@ -95,18 +95,20 @@ static bool contractSupportsMMAMatrixType(vector::ContractionOp contract,
  // Return true if the given map represents a transposed matrix load,
  // i.e. (d0, d1, ...) -> (dn-1, dn-2).
  static bool isTransposeMatrixLoadMap(OpBuilder &b, AffineMap permutationMap) {
+  MLIRContext *ctx = b.getContext();
    auto nDim = permutationMap.getNumDims();
+  AffineExpr zero = b.getAffineConstantExpr(0);
    if (nDim < 2) {
      // Support transposed+broadcasted cases: affine_map<(d0) -> (d0, 0)>.
      AffineExpr dim0 = b.getAffineDimExpr(0);
-    AffineExpr zero = b.getAffineConstantExpr(0);
-    return permutationMap == AffineMap::get(1, 0, {dim0, zero}, b.getContext());
+    return permutationMap == AffineMap::get(1, 0, {dim0, zero}, ctx);
    }
  
    AffineExpr innerDim = b.getAffineDimExpr(nDim - 1);
    AffineExpr outerDim = b.getAffineDimExpr(nDim - 2);
-  return permutationMap ==
-         AffineMap::get(nDim, 0, {innerDim, outerDim}, b.getContext());
+  // Support both transposed and transposed+broadcasted cases.
+  return permutationMap == AffineMap::get(nDim, 0, {innerDim, outerDim}, ctx) ||
+         permutationMap == AffineMap::get(nDim, 0, {innerDim, zero}, ctx);
  }
  
  // Return the stride for the dimension 0 of |type| if it is a memref and has a
diff --git a/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir b/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir

index 56a8599..b00d34f 100644 (file)
--- a/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir
+++ b/mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir
@@ -190,13 +190,13 @@ func.func @matmul_transposed(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>,
    return
  }
  
-// CHECK-LABEL: func @matmul_transposed_broadcasted
+// CHECK-LABEL: func @matmul_transposed_broadcasted_1d
  //   CHECK-DAG:   %[[A:.+]] = gpu.subgroup_mma_load_matrix %{{.*}}[%{{.*}}] {leadDimension = 0 : index, transpose} : memref<16xf16> -> !gpu.mma_matrix<16x16xf16, "AOp">
  //   CHECK-DAG:   %[[B:.+]] = gpu.subgroup_mma_load_matrix %{{.*}}[%{{.*}}] {leadDimension = 0 : index} : memref<16xf16> -> !gpu.mma_matrix<16x16xf16, "BOp">
  //   CHECK-DAG:   %[[C:.+]] = gpu.subgroup_mma_load_matrix %{{.*}}[%{{.*}}, %{{.*}}] {leadDimension = 16 : index} : memref<16x16xf16> -> !gpu.mma_matrix<16x16xf16, "COp">
  //       CHECK:   %[[D:.+]] = gpu.subgroup_mma_compute %[[A]], %[[B]], %[[C]] : !gpu.mma_matrix<16x16xf16, "AOp">, !gpu.mma_matrix<16x16xf16, "BOp"> -> !gpu.mma_matrix<16x16xf16, "COp">
  //       CHECK:   gpu.subgroup_mma_store_matrix %[[D]], %{{.*}}[%{{.*}}, %{{.*}}] {leadDimension = 16 : index} : !gpu.mma_matrix<16x16xf16, "COp">, memref<16x16xf16>
-func.func @matmul_transposed_broadcasted(%arg0: memref<16xf16>, %arg1: memref<16xf16>, %arg2: memref<16x16xf16>) {
+func.func @matmul_transposed_broadcasted_1d(%arg0: memref<16xf16>, %arg1: memref<16xf16>, %arg2: memref<16x16xf16>) {
    %cst_0 = arith.constant dense<0.000000e+00> : vector<16x16xf16>
    %c0 = arith.constant 0 : index
    %cst = arith.constant 0.000000e+00 : f16
@@ -207,3 +207,21 @@ func.func @matmul_transposed_broadcasted(%arg0: memref<16xf16>, %arg1: memref<16
    vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf16>, memref<16x16xf16>
    return
  }
+
+// CHECK-LABEL: func @matmul_transposed_broadcasted_2d
+//   CHECK-DAG:   %[[A:.+]] = gpu.subgroup_mma_load_matrix %{{.*}}[%{{.*}}] {leadDimension = 0 : index, transpose} : memref<32x32xf16> -> !gpu.mma_matrix<16x16xf16, "AOp">
+//   CHECK-DAG:   %[[B:.+]] = gpu.subgroup_mma_load_matrix %{{.*}}[%{{.*}}] {leadDimension = 0 : index} : memref<32x32xf16> -> !gpu.mma_matrix<16x16xf16, "BOp">
+//   CHECK-DAG:   %[[C:.+]] = gpu.subgroup_mma_load_matrix %{{.*}}[%{{.*}}, %{{.*}}] {leadDimension = 16 : index} : memref<16x16xf16> -> !gpu.mma_matrix<16x16xf16, "COp">
+//       CHECK:   %[[D:.+]] = gpu.subgroup_mma_compute %[[A]], %[[B]], %[[C]] : !gpu.mma_matrix<16x16xf16, "AOp">, !gpu.mma_matrix<16x16xf16, "BOp"> -> !gpu.mma_matrix<16x16xf16, "COp">
+//       CHECK:   gpu.subgroup_mma_store_matrix %[[D]], %{{.*}}[%{{.*}}, %{{.*}}] {leadDimension = 16 : index} : !gpu.mma_matrix<16x16xf16, "COp">, memref<16x16xf16>
+func.func @matmul_transposed_broadcasted_2d(%arg0: memref<32x32xf16>, %arg1: memref<32x32xf16>, %arg2: memref<16x16xf16>) {
+  %cst_0 = arith.constant dense<0.000000e+00> : vector<16x16xf16>
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f16
+  %A = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true], permutation_map = affine_map<(d0, d1) -> (d1, 0)>} : memref<32x32xf16>, vector<16x16xf16> // transposed + broadcasted load
+  %B = vector.transfer_read %arg1[%c0, %c0], %cst {in_bounds = [true, true], permutation_map = affine_map<(d0, d1) -> (0, d1)>} : memref<32x32xf16>, vector<16x16xf16> // broadcasted only: the expected BOp load carries no transpose attribute
+  %C = vector.transfer_read %arg2[%c0, %c0], %cst {in_bounds = [true, true]} : memref<16x16xf16>, vector<16x16xf16>
+  %D = vector.contract {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %A, %B, %C : vector<16x16xf16>, vector<16x16xf16> into vector<16x16xf16>
+  vector.transfer_write %D, %arg2[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf16>, memref<16x16xf16>
+  return
+}
author	Lei Zhang <antiagainst@google.com>
	Thu, 15 Dec 2022 19:34:14 +0000 (19:34 +0000)
committer	Lei Zhang <antiagainst@google.com>
	Thu, 15 Dec 2022 19:34:32 +0000 (19:34 +0000)
mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp		patch \| blob \| history
mlir/test/Conversion/VectorToGPU/vector-to-mma-ops.mlir		patch \| blob \| history