From 49ce40e9ab255588bf5093f468b30cbb6b99895b Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Thu, 9 Dec 2021 03:21:22 -0800
Subject: [PATCH] [mlir] AsyncParallelFor: align block size to be a multiple of
 inner loops iterations

Depends On D115263

By aligning block size to inner loop iterations parallel_compute_fn LLVM can later unroll and vectorize some of the inner loops with small number of trip counts. Up to 2x speedup in multiple benchmarks.

Reviewed By: bkramer

Differential Revision: https://reviews.llvm.org/D115436
---
 .../Dialect/Async/Transforms/AsyncParallelFor.cpp  | 110 +++++++++++++++------
 .../Async/async-parallel-for-compute-fn.mlir       |  42 +++++++-
 .../Async/CPU/test-async-parallel-for-2d.mlir      |   2 +-
 3 files changed, 123 insertions(+), 31 deletions(-)
diff --git a/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp b/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp
index a4cab9d..d05e6bc 100644
--- a/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp
+++ b/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp
@@ -240,10 +240,9 @@ getParallelComputeFunctionType(scf::ParallelOp op, PatternRewriter &rewriter) {
 }
 
 // Create a parallel compute fuction from the parallel operation.
-static ParallelComputeFunction
-createParallelComputeFunction(scf::ParallelOp op,
-                              ParallelComputeFunctionBounds bounds,
-                              PatternRewriter &rewriter) {
+static ParallelComputeFunction createParallelComputeFunction(
+    scf::ParallelOp op, ParallelComputeFunctionBounds bounds,
+    unsigned numBlockAlignedInnerLoops, PatternRewriter &rewriter) {
   OpBuilder::InsertionGuard guard(rewriter);
   ImplicitLocOpBuilder b(op.getLoc(), rewriter);
 
@@ -384,17 +383,26 @@ createParallelComputeFunction(scf::ParallelOp op,
 
       // Keep building loop nest.
       if (loopIdx < op.getNumLoops() - 1) {
-        // Select nested loop lower/upper bounds depending on our position in
-        // the multi-dimensional iteration space.
-        auto lb = nb.create<SelectOp>(isBlockFirstCoord[loopIdx],
-                                      blockFirstCoord[loopIdx + 1], c0);
+        if (loopIdx + 1 >= op.getNumLoops() - numBlockAlignedInnerLoops) {
+          // For block aligned loops we always iterate starting from 0 up to
+          // the loop trip counts.
+          nb.create<scf::ForOp>(c0, tripCounts[loopIdx + 1], c1, ValueRange(),
+                                workLoopBuilder(loopIdx + 1));
+
+        } else {
+          // Select nested loop lower/upper bounds depending on our position in
+          // the multi-dimensional iteration space.
+          auto lb = nb.create<SelectOp>(isBlockFirstCoord[loopIdx],
+                                        blockFirstCoord[loopIdx + 1], c0);
+
+          auto ub = nb.create<SelectOp>(isBlockLastCoord[loopIdx],
+                                        blockEndCoord[loopIdx + 1],
+                                        tripCounts[loopIdx + 1]);
+
+          nb.create<scf::ForOp>(lb, ub, c1, ValueRange(),
+                                workLoopBuilder(loopIdx + 1));
+        }
 
-        auto ub = nb.create<SelectOp>(isBlockLastCoord[loopIdx],
-                                      blockEndCoord[loopIdx + 1],
-                                      tripCounts[loopIdx + 1]);
-
-        nb.create<scf::ForOp>(lb, ub, c1, ValueRange(),
-                              workLoopBuilder(loopIdx + 1));
         nb.create<scf::YieldOp>(loc);
         return;
       }
@@ -731,6 +739,46 @@ AsyncParallelForRewrite::matchAndRewrite(scf::ParallelOp op,
   auto dispatch = [&](OpBuilder &nestedBuilder, Location loc) {
     ImplicitLocOpBuilder nb(loc, nestedBuilder);
 
+    // Collect statically known constants defining the loop nest in the parallel
+    // compute function. LLVM can't always push constants across the non-trivial
+    // async dispatch call graph, by providing these values explicitly we can
+    // choose to build more efficient loop nest, and rely on a better constant
+    // folding, loop unrolling and vectorization.
+    ParallelComputeFunctionBounds staticBounds = {
+        integerConstants(tripCounts),
+        integerConstants(op.lowerBound()),
+        integerConstants(op.upperBound()),
+        integerConstants(op.step()),
+    };
+
+    // Find how many inner iteration dimensions are statically known, and their
+    // product is smaller than the `512`. We aling the parallel compute block
+    // size by the product of statically known dimensions, so that we can
+    // guarantee that the inner loops executes from 0 to the loop trip counts
+    // and we can elide dynamic loop boundaries, and give LLVM an opportunity to
+    // unroll the loops. The constant `512` is arbitrary, it should depend on
+    // how many iterations LLVM will typically decide to unroll.
+    static constexpr int64_t maxIterations = 512;
+
+    // The number of inner loops with statically known number of iterations less
+    // than the `maxIterations` value.
+    int numUnrollableLoops = 0;
+
+    auto getInt = [](IntegerAttr attr) { return attr ? attr.getInt() : 0; };
+
+    SmallVector<int64_t> numIterations(op.getNumLoops());
+    numIterations.back() = getInt(staticBounds.tripCounts.back());
+
+    for (int i = op.getNumLoops() - 2; i >= 0; --i) {
+      int64_t tripCount = getInt(staticBounds.tripCounts[i]);
+      int64_t innerIterations = numIterations[i + 1];
+      numIterations[i] = tripCount * innerIterations;
+
+      // Update the number of inner loops that we can potentially unroll.
+      if (innerIterations > 0 && innerIterations <= maxIterations)
+        numUnrollableLoops++;
+    }
+
     // With large number of threads the value of creating many compute blocks
     // is reduced because the problem typically becomes memory bound. For small
     // number of threads it helps with stragglers.
@@ -755,24 +803,28 @@ AsyncParallelForRewrite::matchAndRewrite(scf::ParallelOp op,
     Value bs0 = b.create<arith::CeilDivSIOp>(tripCount, maxComputeBlocks);
     Value bs1 = b.create<arith::MaxSIOp>(bs0, minTaskSizeCst);
     Value blockSize = b.create<arith::MinSIOp>(tripCount, bs1);
-    Value blockCount = b.create<arith::CeilDivSIOp>(tripCount, blockSize);
 
-    // Collect statically known constants defining the loop nest in the parallel
-    // compute function. LLVM can't always push constants across the non-trivial
-    // async dispatch call graph, by providing these values explicitly we can
-    // choose to build more efficient loop nest, and rely on a better constant
-    // folding, loop unrolling and vectorization.
-    ParallelComputeFunctionBounds staticBounds = {
-        integerConstants(tripCounts),
-        integerConstants(op.lowerBound()),
-        integerConstants(op.upperBound()),
-        integerConstants(op.step()),
-    };
+    // Align the block size to be a multiple of the statically known number
+    // of iterations in the inner loops.
+    if (numUnrollableLoops > 0 && minTaskSize >= maxIterations) {
+      Value numIters = b.create<arith::ConstantIndexOp>(
+          numIterations[op.getNumLoops() - numUnrollableLoops]);
+      Value bs2 = b.create<arith::MulIOp>(
+          b.create<arith::CeilDivSIOp>(blockSize, numIters), numIters);
+      blockSize = b.create<arith::MinSIOp>(tripCount, bs2);
+    } else {
+      // Reset the number of unrollable loops if we didn't align the block size.
+      numUnrollableLoops = 0;
+    }
+
+    // Compute the number of parallel compute blocks.
+    Value blockCount = b.create<arith::CeilDivSIOp>(tripCount, blockSize);
 
-    // Create a parallel compute function that takes a block id and computes the
-    // parallel operation body for a subset of iteration space.
+    // Create a parallel compute function that takes a block id and computes
+    // the parallel operation body for a subset of iteration space.
     ParallelComputeFunction parallelComputeFunction =
-        createParallelComputeFunction(op, staticBounds, rewriter);
+        createParallelComputeFunction(op, staticBounds, numUnrollableLoops,
+                                      rewriter);
 
     // Dispatch parallel compute function using async recursive work splitting,
     // or by submitting compute task sequentially from a caller thread.
diff --git a/mlir/test/Dialect/Async/async-parallel-for-compute-fn.mlir b/mlir/test/Dialect/Async/async-parallel-for-compute-fn.mlir
index f95715d..591f770 100644
--- a/mlir/test/Dialect/Async/async-parallel-for-compute-fn.mlir
+++ b/mlir/test/Dialect/Async/async-parallel-for-compute-fn.mlir
@@ -65,4 +65,44 @@ func @sink_constant_step(%arg0: memref<?xf32>, %lb: index, %ub: index) {
 // CHECK:        scf.for %[[I:arg[0-9]+]]
 // CHECK:          %[[TMP:.*]] = arith.muli %[[I]], %[[CSTEP]]
 // CHECK:          %[[IDX:.*]] = arith.addi %[[LB]], %[[TMP]]
-// CHECK:          memref.store
\ No newline at end of file
+// CHECK:          memref.store
+
+// -----
+
+// Check that for statically known inner loop bound block size is aligned and
+// inner loop uses statically known loop trip counts.
+
+// CHECK-LABEL: func @sink_constant_step(
+func @sink_constant_step(%arg0: memref<?x10xf32>, %lb: index, %ub: index) {
+  %one = arith.constant 1.0 : f32
+
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+
+  scf.parallel (%i, %j) = (%lb, %c0) to (%ub, %c10) step (%c1, %c1) {
+    memref.store %one, %arg0[%i, %j] : memref<?x10xf32>
+  }
+
+  return
+}
+
+// CHECK-LABEL: func private @parallel_compute_fn(
+// CHECK-SAME:   %[[BLOCK_INDEX:arg[0-9]+]]: index,
+// CHECK-SAME:   %[[BLOCK_SIZE:arg[0-9]+]]: index,
+// CHECK-SAME:   %[[TRIP_COUNT0:arg[0-9]+]]: index,
+// CHECK-SAME:   %[[TRIP_COUNT1:arg[0-9]+]]: index,
+// CHECK-SAME:   %[[LB0:arg[0-9]+]]: index,
+// CHECK-SAME:   %[[LB1:arg[0-9]+]]: index,
+// CHECK-SAME:   %[[UB0:arg[0-9]+]]: index,
+// CHECK-SAME:   %[[UB1:arg[0-9]+]]: index,
+// CHECK-SAME:   %[[STEP0:arg[0-9]+]]: index,
+// CHECK-SAME:   %[[STEP1:arg[0-9]+]]: index,
+// CHECK-SAME:   %[[MEMREF:arg[0-9]+]]: memref<?x10xf32>
+// CHECK-SAME: ) {
+// CHECK:        %[[C0:.*]] = arith.constant 0 : index
+// CHECK:        %[[C1:.*]] = arith.constant 1 : index
+// CHECK:        %[[C10:.*]] = arith.constant 10 : index
+// CHECK:        scf.for %[[I:arg[0-9]+]]
+// CHECK-NOT:      select
+// CHECK:          scf.for %[[J:arg[0-9]+]] = %c0 to %c10 step %c1
\ No newline at end of file
diff --git a/mlir/test/Integration/Dialect/Async/CPU/test-async-parallel-for-2d.mlir b/mlir/test/Integration/Dialect/Async/CPU/test-async-parallel-for-2d.mlir
index b92dfba..4447473 100644
--- a/mlir/test/Integration/Dialect/Async/CPU/test-async-parallel-for-2d.mlir
+++ b/mlir/test/Integration/Dialect/Async/CPU/test-async-parallel-for-2d.mlir
@@ -31,7 +31,7 @@
 
 // RUN:   mlir-opt %s -async-parallel-for="async-dispatch=false                \
 // RUN:                                    num-workers=20                      \
-// RUN:                                    min-task-size=1"                \
+// RUN:                                    min-task-size=1"                    \
 // RUN:               -async-to-async-runtime                                  \
 // RUN:               -async-runtime-ref-counting                              \
 // RUN:               -async-runtime-ref-counting-opt                          \
-- 
2.7.4