From 49ce40e9ab255588bf5093f468b30cbb6b99895b Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Thu, 9 Dec 2021 03:21:22 -0800 Subject: [PATCH] [mlir] AsyncParallelFor: align block size to be a multiple of inner loops iterations Depends On D115263 By aligning block size to inner loop iterations parallel_compute_fn LLVM can later unroll and vectorize some of the inner loops with small number of trip counts. Up to 2x speedup in multiple benchmarks. Reviewed By: bkramer Differential Revision: https://reviews.llvm.org/D115436 --- .../Dialect/Async/Transforms/AsyncParallelFor.cpp | 110 +++++++++++++++------ .../Async/async-parallel-for-compute-fn.mlir | 42 +++++++- .../Async/CPU/test-async-parallel-for-2d.mlir | 2 +- 3 files changed, 123 insertions(+), 31 deletions(-) diff --git a/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp b/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp index a4cab9d..d05e6bc 100644 --- a/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp +++ b/mlir/lib/Dialect/Async/Transforms/AsyncParallelFor.cpp @@ -240,10 +240,9 @@ getParallelComputeFunctionType(scf::ParallelOp op, PatternRewriter &rewriter) { } // Create a parallel compute fuction from the parallel operation. -static ParallelComputeFunction -createParallelComputeFunction(scf::ParallelOp op, - ParallelComputeFunctionBounds bounds, - PatternRewriter &rewriter) { +static ParallelComputeFunction createParallelComputeFunction( + scf::ParallelOp op, ParallelComputeFunctionBounds bounds, + unsigned numBlockAlignedInnerLoops, PatternRewriter &rewriter) { OpBuilder::InsertionGuard guard(rewriter); ImplicitLocOpBuilder b(op.getLoc(), rewriter); @@ -384,17 +383,26 @@ createParallelComputeFunction(scf::ParallelOp op, // Keep building loop nest. if (loopIdx < op.getNumLoops() - 1) { - // Select nested loop lower/upper bounds depending on our position in - // the multi-dimensional iteration space. - auto lb = nb.create(isBlockFirstCoord[loopIdx], - blockFirstCoord[loopIdx + 1], c0); + if (loopIdx + 1 >= op.getNumLoops() - numBlockAlignedInnerLoops) { + // For block aligned loops we always iterate starting from 0 up to + // the loop trip counts. + nb.create(c0, tripCounts[loopIdx + 1], c1, ValueRange(), + workLoopBuilder(loopIdx + 1)); + + } else { + // Select nested loop lower/upper bounds depending on our position in + // the multi-dimensional iteration space. + auto lb = nb.create(isBlockFirstCoord[loopIdx], + blockFirstCoord[loopIdx + 1], c0); + + auto ub = nb.create(isBlockLastCoord[loopIdx], + blockEndCoord[loopIdx + 1], + tripCounts[loopIdx + 1]); + + nb.create(lb, ub, c1, ValueRange(), + workLoopBuilder(loopIdx + 1)); + } - auto ub = nb.create(isBlockLastCoord[loopIdx], - blockEndCoord[loopIdx + 1], - tripCounts[loopIdx + 1]); - - nb.create(lb, ub, c1, ValueRange(), - workLoopBuilder(loopIdx + 1)); nb.create(loc); return; } @@ -731,6 +739,46 @@ AsyncParallelForRewrite::matchAndRewrite(scf::ParallelOp op, auto dispatch = [&](OpBuilder &nestedBuilder, Location loc) { ImplicitLocOpBuilder nb(loc, nestedBuilder); + // Collect statically known constants defining the loop nest in the parallel + // compute function. LLVM can't always push constants across the non-trivial + // async dispatch call graph, by providing these values explicitly we can + // choose to build more efficient loop nest, and rely on a better constant + // folding, loop unrolling and vectorization. + ParallelComputeFunctionBounds staticBounds = { + integerConstants(tripCounts), + integerConstants(op.lowerBound()), + integerConstants(op.upperBound()), + integerConstants(op.step()), + }; + + // Find how many inner iteration dimensions are statically known, and their + // product is smaller than the `512`. We aling the parallel compute block + // size by the product of statically known dimensions, so that we can + // guarantee that the inner loops executes from 0 to the loop trip counts + // and we can elide dynamic loop boundaries, and give LLVM an opportunity to + // unroll the loops. The constant `512` is arbitrary, it should depend on + // how many iterations LLVM will typically decide to unroll. + static constexpr int64_t maxIterations = 512; + + // The number of inner loops with statically known number of iterations less + // than the `maxIterations` value. + int numUnrollableLoops = 0; + + auto getInt = [](IntegerAttr attr) { return attr ? attr.getInt() : 0; }; + + SmallVector numIterations(op.getNumLoops()); + numIterations.back() = getInt(staticBounds.tripCounts.back()); + + for (int i = op.getNumLoops() - 2; i >= 0; --i) { + int64_t tripCount = getInt(staticBounds.tripCounts[i]); + int64_t innerIterations = numIterations[i + 1]; + numIterations[i] = tripCount * innerIterations; + + // Update the number of inner loops that we can potentially unroll. + if (innerIterations > 0 && innerIterations <= maxIterations) + numUnrollableLoops++; + } + // With large number of threads the value of creating many compute blocks // is reduced because the problem typically becomes memory bound. For small // number of threads it helps with stragglers. @@ -755,24 +803,28 @@ AsyncParallelForRewrite::matchAndRewrite(scf::ParallelOp op, Value bs0 = b.create(tripCount, maxComputeBlocks); Value bs1 = b.create(bs0, minTaskSizeCst); Value blockSize = b.create(tripCount, bs1); - Value blockCount = b.create(tripCount, blockSize); - // Collect statically known constants defining the loop nest in the parallel - // compute function. LLVM can't always push constants across the non-trivial - // async dispatch call graph, by providing these values explicitly we can - // choose to build more efficient loop nest, and rely on a better constant - // folding, loop unrolling and vectorization. - ParallelComputeFunctionBounds staticBounds = { - integerConstants(tripCounts), - integerConstants(op.lowerBound()), - integerConstants(op.upperBound()), - integerConstants(op.step()), - }; + // Align the block size to be a multiple of the statically known number + // of iterations in the inner loops. + if (numUnrollableLoops > 0 && minTaskSize >= maxIterations) { + Value numIters = b.create( + numIterations[op.getNumLoops() - numUnrollableLoops]); + Value bs2 = b.create( + b.create(blockSize, numIters), numIters); + blockSize = b.create(tripCount, bs2); + } else { + // Reset the number of unrollable loops if we didn't align the block size. + numUnrollableLoops = 0; + } + + // Compute the number of parallel compute blocks. + Value blockCount = b.create(tripCount, blockSize); - // Create a parallel compute function that takes a block id and computes the - // parallel operation body for a subset of iteration space. + // Create a parallel compute function that takes a block id and computes + // the parallel operation body for a subset of iteration space. ParallelComputeFunction parallelComputeFunction = - createParallelComputeFunction(op, staticBounds, rewriter); + createParallelComputeFunction(op, staticBounds, numUnrollableLoops, + rewriter); // Dispatch parallel compute function using async recursive work splitting, // or by submitting compute task sequentially from a caller thread. diff --git a/mlir/test/Dialect/Async/async-parallel-for-compute-fn.mlir b/mlir/test/Dialect/Async/async-parallel-for-compute-fn.mlir index f95715d..591f770 100644 --- a/mlir/test/Dialect/Async/async-parallel-for-compute-fn.mlir +++ b/mlir/test/Dialect/Async/async-parallel-for-compute-fn.mlir @@ -65,4 +65,44 @@ func @sink_constant_step(%arg0: memref, %lb: index, %ub: index) { // CHECK: scf.for %[[I:arg[0-9]+]] // CHECK: %[[TMP:.*]] = arith.muli %[[I]], %[[CSTEP]] // CHECK: %[[IDX:.*]] = arith.addi %[[LB]], %[[TMP]] -// CHECK: memref.store \ No newline at end of file +// CHECK: memref.store + +// ----- + +// Check that for statically known inner loop bound block size is aligned and +// inner loop uses statically known loop trip counts. + +// CHECK-LABEL: func @sink_constant_step( +func @sink_constant_step(%arg0: memref, %lb: index, %ub: index) { + %one = arith.constant 1.0 : f32 + + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c10 = arith.constant 10 : index + + scf.parallel (%i, %j) = (%lb, %c0) to (%ub, %c10) step (%c1, %c1) { + memref.store %one, %arg0[%i, %j] : memref + } + + return +} + +// CHECK-LABEL: func private @parallel_compute_fn( +// CHECK-SAME: %[[BLOCK_INDEX:arg[0-9]+]]: index, +// CHECK-SAME: %[[BLOCK_SIZE:arg[0-9]+]]: index, +// CHECK-SAME: %[[TRIP_COUNT0:arg[0-9]+]]: index, +// CHECK-SAME: %[[TRIP_COUNT1:arg[0-9]+]]: index, +// CHECK-SAME: %[[LB0:arg[0-9]+]]: index, +// CHECK-SAME: %[[LB1:arg[0-9]+]]: index, +// CHECK-SAME: %[[UB0:arg[0-9]+]]: index, +// CHECK-SAME: %[[UB1:arg[0-9]+]]: index, +// CHECK-SAME: %[[STEP0:arg[0-9]+]]: index, +// CHECK-SAME: %[[STEP1:arg[0-9]+]]: index, +// CHECK-SAME: %[[MEMREF:arg[0-9]+]]: memref +// CHECK-SAME: ) { +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[C10:.*]] = arith.constant 10 : index +// CHECK: scf.for %[[I:arg[0-9]+]] +// CHECK-NOT: select +// CHECK: scf.for %[[J:arg[0-9]+]] = %c0 to %c10 step %c1 \ No newline at end of file diff --git a/mlir/test/Integration/Dialect/Async/CPU/test-async-parallel-for-2d.mlir b/mlir/test/Integration/Dialect/Async/CPU/test-async-parallel-for-2d.mlir index b92dfba..4447473 100644 --- a/mlir/test/Integration/Dialect/Async/CPU/test-async-parallel-for-2d.mlir +++ b/mlir/test/Integration/Dialect/Async/CPU/test-async-parallel-for-2d.mlir @@ -31,7 +31,7 @@ // RUN: mlir-opt %s -async-parallel-for="async-dispatch=false \ // RUN: num-workers=20 \ -// RUN: min-task-size=1" \ +// RUN: min-task-size=1" \ // RUN: -async-to-async-runtime \ // RUN: -async-runtime-ref-counting \ // RUN: -async-runtime-ref-counting-opt \ -- 2.7.4