[mlir] Mode for explicitly controlling the fusion kind

author Sumesh Udayakumaran <sumesh.uk@gmail.com>

Sat, 25 Sep 2021 22:46:03 +0000 (01:46 +0300)

committer Sumesh Udayakumaran <sumesh.uk@gmail.com>

Mon, 27 Sep 2021 17:37:42 +0000 (20:37 +0300)
author Sumesh Udayakumaran <sumesh.uk@gmail.com>
Sat, 25 Sep 2021 22:46:03 +0000 (01:46 +0300)
committer Sumesh Udayakumaran <sumesh.uk@gmail.com>
Mon, 27 Sep 2021 17:37:42 +0000 (20:37 +0300)
diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h

index eef88b5d7a48c66cd8c526c62f4dc378c1c7274a..3a5b9bcecbdab48503aaacd522ce14afb94cf1c5 100644 (file)
--- a/mlir/include/mlir/Transforms/Passes.h
+++ b/mlir/include/mlir/Transforms/Passes.h
@@ -25,6 +25,10 @@ namespace mlir {
  class AffineForOp;
  class GreedyRewriteConfig;
  
+/// Fusion mode to attempt. The default mode `Greedy` does both
+/// producer-consumer and sibling fusion.
+enum FusionMode { Greedy, ProducerConsumer, Sibling };
+
  //===----------------------------------------------------------------------===//
  // Passes
  //===----------------------------------------------------------------------===//
@@ -72,13 +76,14 @@ createCanonicalizerPass(const GreedyRewriteConfig &config);
  /// Creates a pass to perform common sub expression elimination.
  std::unique_ptr<Pass> createCSEPass();
  
-/// Creates a loop fusion pass which fuses loops. Buffers of size less than or
-/// equal to `localBufSizeThreshold` are promoted to memory space
-/// `fastMemorySpace'.
+/// Creates a loop fusion pass which fuses loops according to the type of
+/// fusion specified in `fusionMode`. Buffers of size less than or equal to
+/// `localBufSizeThreshold` are promoted to memory space `fastMemorySpace`.
  std::unique_ptr<OperationPass<FuncOp>>
  createLoopFusionPass(unsigned fastMemorySpace = 0,
                       uint64_t localBufSizeThreshold = 0,
-                     bool maximalFusion = false);
+                     bool maximalFusion = false,
+                     enum FusionMode fusionMode = FusionMode::Greedy);
  
  /// Creates a loop invariant code motion pass that hoists loop invariant
  /// instructions out of the loop.
diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td

index 91af2a2c56a93151ac52656441a4db9b349826e7..360b98d87c15638e828108b6fbfb91eb9d1929e7 100644 (file)
--- a/mlir/include/mlir/Transforms/Passes.td
+++ b/mlir/include/mlir/Transforms/Passes.td
@@ -136,7 +136,15 @@ def AffineLoopFusion : FunctionPass<"affine-loop-fusion"> {
                              "to fast memory space">,
      Option<"maximalFusion", "fusion-maximal", "bool", /*default=*/"false",
             "Enables maximal loop fusion">,
-  ];
+    Option<"affineFusionMode", "mode", "enum FusionMode",
+           "mlir::FusionMode::Greedy", "fusion mode to attempt",
+           "llvm::cl::values(clEnumValN(mlir::FusionMode::Greedy,"
+           " \"greedy\", \"Perform greedy (both producer-consumer and sibling)  fusion\"), "
+           "clEnumValN( mlir::FusionMode::ProducerConsumer, "
+           "\"producer\", \"Perform only producer-consumer fusion\"), "
+           "clEnumValN( mlir::FusionMode::Sibling, "
+           "\"sibling\", \"Perform only sibling fusion\"))">,
+    ];
    let dependentDialects = ["memref::MemRefDialect"];
  }
  
diff --git a/mlir/lib/Transforms/LoopFusion.cpp b/mlir/lib/Transforms/LoopFusion.cpp

index c19c887a593d6a13b1521d3d7d9df0994c9f40dd..6a456ea84b35023828bd030a4d917cdcda9b4ea8 100644 (file)
--- a/mlir/lib/Transforms/LoopFusion.cpp
+++ b/mlir/lib/Transforms/LoopFusion.cpp
@@ -49,10 +49,11 @@ namespace {
  struct LoopFusion : public AffineLoopFusionBase<LoopFusion> {
    LoopFusion() = default;
    LoopFusion(unsigned fastMemorySpace, uint64_t localBufSizeThresholdBytes,
-             bool maximalFusion) {
+             bool maximalFusion, enum FusionMode affineFusionMode) {
      this->fastMemorySpace = fastMemorySpace;
      this->localBufSizeThreshold = localBufSizeThresholdBytes / 1024;
      this->maximalFusion = maximalFusion;
+    this->affineFusionMode = affineFusionMode;
    }
  
    void runOnFunction() override;
@@ -62,9 +63,10 @@ struct LoopFusion : public AffineLoopFusionBase<LoopFusion> {
  
  std::unique_ptr<OperationPass<FuncOp>>
  mlir::createLoopFusionPass(unsigned fastMemorySpace,
-                           uint64_t localBufSizeThreshold, bool maximalFusion) {
+                           uint64_t localBufSizeThreshold, bool maximalFusion,
+                           enum FusionMode affineFusionMode) {
    return std::make_unique<LoopFusion>(fastMemorySpace, localBufSizeThreshold,
-                                      maximalFusion);
+                                      maximalFusion, affineFusionMode);
  }
  
  namespace {
@@ -1391,13 +1393,25 @@ public:
        worklist.push_back(node.id);
      }
    }
+  /// Run only sibling fusion on the `mdg`.
+  void runSiblingFusionOnly() {
+    fuseSiblingNodes();
+    eraseUnusedMemRefAllocations();
+  }
+
+  /// Run only producer/consumer fusion on the `mdg`.
+  void runProducerConsumerFusionOnly() {
+    fuseProducerConsumerNodes(
+        /*maxSrcUserCount=*/std::numeric_limits<unsigned>::max());
+    eraseUnusedMemRefAllocations();
+  }
  
    // Run the GreedyFusion pass.
    // *) First pass through the nodes fuses single-use producer nodes into their
    //    unique consumer.
    // *) Second pass fuses sibling nodes which share no dependence edges.
    // *) Third pass fuses any remaining producer nodes into their users.
-  void run() {
+  void runGreedyFusion() {
      // TODO: Run this repeatedly until a fixed-point is reached.
      fuseProducerConsumerNodes(/*maxSrcUserCount=*/1);
      fuseSiblingNodes();
@@ -1971,5 +1985,11 @@ void LoopFusion::runOnFunction() {
    unsigned localBufSizeThresholdBytes = localBufSizeThreshold * 1024;
    GreedyFusion fusion(&g, localBufSizeThresholdBytes, fastMemorySpaceOpt,
                        maximalFusion, computeToleranceThreshold);
-  fusion.run();
+
+  if (affineFusionMode == FusionMode::ProducerConsumer)
+    fusion.runProducerConsumerFusionOnly();
+  else if (affineFusionMode == FusionMode::Sibling)
+    fusion.runSiblingFusionOnly();
+  else
+    fusion.runGreedyFusion();
  }
diff --git a/mlir/lib/Transforms/PassDetail.h b/mlir/lib/Transforms/PassDetail.h

index 2cb0e12b1cf204409e67b73232e9851e36341a6e..c6a67e32511bc41bb7751910fd979ee93d2c625f 100644 (file)
--- a/mlir/lib/Transforms/PassDetail.h
+++ b/mlir/lib/Transforms/PassDetail.h
@@ -10,6 +10,7 @@
  #define TRANSFORMS_PASSDETAIL_H_
  
  #include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/Passes.h"
  
  namespace mlir {
  class AffineDialect;
diff --git a/mlir/test/Transforms/loop-fusion-4.mlir b/mlir/test/Transforms/loop-fusion-4.mlir

index 61fd4e3c777def91d9f1d7a19af81ea055a6fac4..15b345e3780b56519cc000daf80e1f973efb7b06 100644 (file)
--- a/mlir/test/Transforms/loop-fusion-4.mlir
+++ b/mlir/test/Transforms/loop-fusion-4.mlir
@@ -1,54 +1,13 @@
-// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-fusion -split-input-file | FileCheck %s
-// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-fusion="fusion-maximal" -split-input-file | FileCheck %s --check-prefix=MAXIMAL
+// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-fusion="mode=producer" -split-input-file | FileCheck %s --check-prefix=PRODUCER-CONSUMER
+// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-fusion="fusion-maximal mode=sibling" -split-input-file | FileCheck %s --check-prefix=SIBLING-MAXIMAL
  
-// Part I of fusion tests in  mlir/test/Transforms/loop-fusion.mlir. 
+// Part I of fusion tests in mlir/test/Transforms/loop-fusion.mlir.
  // Part II of fusion tests in mlir/test/Transforms/loop-fusion-2.mlir
  // Part III of fusion tests in mlir/test/Transforms/loop-fusion-3.mlir
  
-// -----
-
-func @reduce_add_non_maximal_f32_f32(%arg0: memref<64x64xf32, 1>, %arg1 : memref<1x64xf32, 1>, %arg2 : memref<1x64xf32, 1>) {
-    %cst_0 = constant 0.000000e+00 : f32
-    %cst_1 = constant 1.000000e+00 : f32
-    affine.for %arg3 = 0 to 1 {
-      affine.for %arg4 = 0 to 64 {
-        %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst_0) -> f32 {
-          %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1>
-          %5 = addf %prevAccum, %4 : f32
-          affine.yield %5 : f32
-        }
-        %accum_dbl = addf %accum, %accum : f32
-        affine.store %accum_dbl, %arg1[%arg3, %arg4] : memref<1x64xf32, 1>
-      }
-    }
-    affine.for %arg3 = 0 to 1 {
-      affine.for %arg4 = 0 to 64 {
-        // Following loop  trip count does not match the corresponding source trip count.
-        %accum = affine.for %arg5 = 0 to 32 iter_args (%prevAccum = %cst_1) -> f32 {
-          %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1>
-          %5 = mulf %prevAccum, %4 : f32
-          affine.yield %5 : f32
-        }
-        %accum_sqr = mulf %accum, %accum : f32
-        affine.store %accum_sqr, %arg2[%arg3, %arg4] : memref<1x64xf32, 1>
-      }
-    }
-    return
-}
-// Test checks the loop structure is preserved after sibling fusion
-// since the destination loop and source loop trip counts do not
-// match.
-// MAXIMAL-LABEL:   func @reduce_add_non_maximal_f32_f32(
-// MAXIMAL:        %[[cst_0:.*]] = constant 0.000000e+00 : f32
-// MAXIMAL-NEXT:        %[[cst_1:.*]] = constant 1.000000e+00 : f32
-// MAXIMAL-NEXT:           affine.for %[[idx_0:.*]]= 0 to 1 {
-// MAXIMAL-NEXT:             affine.for %[[idx_1:.*]] = 0 to 64 {
-// MAXIMAL-NEXT:               %[[result_1:.*]] = affine.for %[[idx_2:.*]] = 0 to 32 iter_args(%[[iter_0:.*]] = %[[cst_1]]) -> (f32) {
-// MAXIMAL-NEXT:                 %[[result_0:.*]] = affine.for %[[idx_3:.*]] = 0 to 64 iter_args(%[[iter_1:.*]] = %[[cst_0]]) -> (f32) {
-
  // Expects fusion of producer into consumer at depth 4 and subsequent removal of
  // source loop.
-// CHECK-LABEL: func @unflatten4d
+// PRODUCER-CONSUMER-LABEL: func @unflatten4d
  func @unflatten4d(%arg1: memref<7x8x9x10xf32>) {
    %m = memref.alloc() : memref<5040xf32>
    %cf7 = constant 7.0 : f32
@@ -75,18 +34,18 @@ func @unflatten4d(%arg1: memref<7x8x9x10xf32>) {
    return
  }
  
-// CHECK:        affine.for
-// CHECK-NEXT:     affine.for
-// CHECK-NEXT:       affine.for
-// CHECK-NEXT:         affine.for
-// CHECK-NOT:    affine.for
-// CHECK: return
+// PRODUCER-CONSUMER:        affine.for
+// PRODUCER-CONSUMER-NEXT:     affine.for
+// PRODUCER-CONSUMER-NEXT:       affine.for
+// PRODUCER-CONSUMER-NEXT:         affine.for
+// PRODUCER-CONSUMER-NOT:    affine.for
+// PRODUCER-CONSUMER: return
  
  // -----
  
  // Expects fusion of producer into consumer at depth 2 and subsequent removal of
  // source loop.
-// CHECK-LABEL: func @unflatten2d_with_transpose
+// PRODUCER-CONSUMER-LABEL: func @unflatten2d_with_transpose
  func @unflatten2d_with_transpose(%arg1: memref<8x7xf32>) {
    %m = memref.alloc() : memref<56xf32>
    %cf7 = constant 7.0 : f32
@@ -105,7 +64,48 @@ func @unflatten2d_with_transpose(%arg1: memref<8x7xf32>) {
    return
  }
  
-// CHECK:        affine.for
-// CHECK-NEXT:     affine.for
-// CHECK-NOT:    affine.for
-// CHECK: return
-\ No newline at end of file
+// PRODUCER-CONSUMER:        affine.for
+// PRODUCER-CONSUMER-NEXT:     affine.for
+// PRODUCER-CONSUMER-NOT:    affine.for
+// PRODUCER-CONSUMER: return
+
+// -----
+
+// SIBLING-MAXIMAL-LABEL:   func @reduce_add_non_maximal_f32_f32(
+func @reduce_add_non_maximal_f32_f32(%arg0: memref<64x64xf32, 1>, %arg1 : memref<1x64xf32, 1>, %arg2 : memref<1x64xf32, 1>) {
+    %cst_0 = constant 0.000000e+00 : f32
+    %cst_1 = constant 1.000000e+00 : f32
+    affine.for %arg3 = 0 to 1 {
+      affine.for %arg4 = 0 to 64 {
+        %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst_0) -> f32 {
+          %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1>
+          %5 = addf %prevAccum, %4 : f32
+          affine.yield %5 : f32
+        }
+        %accum_dbl = addf %accum, %accum : f32
+        affine.store %accum_dbl, %arg1[%arg3, %arg4] : memref<1x64xf32, 1>
+      }
+    }
+    affine.for %arg3 = 0 to 1 {
+      affine.for %arg4 = 0 to 64 {
+        // The following loop's trip count does not match the corresponding source trip count.
+        %accum = affine.for %arg5 = 0 to 32 iter_args (%prevAccum = %cst_1) -> f32 {
+          %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1>
+          %5 = mulf %prevAccum, %4 : f32
+          affine.yield %5 : f32
+        }
+        %accum_sqr = mulf %accum, %accum : f32
+        affine.store %accum_sqr, %arg2[%arg3, %arg4] : memref<1x64xf32, 1>
+      }
+    }
+    return
+}
+// Test checks the loop structure is preserved after sibling fusion
+// since the destination loop and source loop trip counts do not
+// match.
+// SIBLING-MAXIMAL:        %[[cst_0:.*]] = constant 0.000000e+00 : f32
+// SIBLING-MAXIMAL-NEXT:        %[[cst_1:.*]] = constant 1.000000e+00 : f32
+// SIBLING-MAXIMAL-NEXT:           affine.for %[[idx_0:.*]]= 0 to 1 {
+// SIBLING-MAXIMAL-NEXT:             affine.for %[[idx_1:.*]] = 0 to 64 {
+// SIBLING-MAXIMAL-NEXT:               %[[result_1:.*]] = affine.for %[[idx_2:.*]] = 0 to 32 iter_args(%[[iter_0:.*]] = %[[cst_1]]) -> (f32) {
+// SIBLING-MAXIMAL-NEXT:                 %[[result_0:.*]] = affine.for %[[idx_3:.*]] = 0 to 64 iter_args(%[[iter_1:.*]] = %[[cst_0]]) -> (f32) {
+\ No newline at end of file
author	Sumesh Udayakumaran <sumesh.uk@gmail.com>
	Sat, 25 Sep 2021 22:46:03 +0000 (01:46 +0300)
committer	Sumesh Udayakumaran <sumesh.uk@gmail.com>
	Mon, 27 Sep 2021 17:37:42 +0000 (20:37 +0300)
mlir/include/mlir/Transforms/Passes.h		patch \| blob \| history
mlir/include/mlir/Transforms/Passes.td		patch \| blob \| history
mlir/lib/Transforms/LoopFusion.cpp		patch \| blob \| history
mlir/lib/Transforms/PassDetail.h		patch \| blob \| history
mlir/test/Transforms/loop-fusion-4.mlir		patch \| blob \| history