From dc930e5f2f91e7eb5ebc9cb61f6a71bc8924559e Mon Sep 17 00:00:00 2001
From: Navdeep Kumar
Date: Mon, 7 Dec 2020 14:57:51 +0530
Subject: [PATCH] [MLIR][Affine] Add affine.for normalization support

Add support to normalize affine.for ops, i.e., convert the lower bound
to zero and the loop step to one. The upper bound is set to the trip
count of the loop, and the exact value of the original loop IV is
computed by an affine.apply just inside the body of the affine.for.
Currently, only loops whose lower bound map has a single result are
supported; there is no such restriction on upper bounds.

Differential Revision: https://reviews.llvm.org/D92233
---
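Note for reviewers: a minimal before/after sketch of the rewrite (the
numbers mirror the simple_loop_nest test added below; the SSA names are
arbitrary). A loop such as

  affine.for %i = 2 to 32 step 2 {
    "test.foo"(%i) : (index) -> ()
  }

becomes a zero-based, unit-step loop with upper bound
(32 - 2) ceildiv 2 = 15, with the original IV value recovered by an
affine.apply at the top of the body:

  affine.for %ii = 0 to 15 {
    %i = affine.apply affine_map<(d0) -> (d0 * 2 + 2)>(%ii)
    "test.foo"(%i) : (index) -> ()
  }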
 .../Affine/Transforms/AffineLoopNormalize.cpp      |  98 ++++++++++++-
 .../test/Dialect/Affine/affine-loop-normalize.mlir | 159 +++++++++++++++++++++
 2 files changed, 253 insertions(+), 4 deletions(-)

diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineLoopNormalize.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineLoopNormalize.cpp
index 2ad403e..d863663 100644
--- a/mlir/lib/Dialect/Affine/Transforms/AffineLoopNormalize.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineLoopNormalize.cpp
@@ -79,14 +79,104 @@ void mlir::normalizeAffineParallel(AffineParallelOp op) {
   op.setUpperBounds(ranges.getOperands(), newUpperMap);
 }
 
-/// Normalization transformations for affine.for ops. For now, it only removes
-/// single iteration loops. We may want to consider separating redundant loop
-/// elimitation from loop bound normalization, if needed in the future.
+/// Normalizes affine.for ops. If the affine.for op has only a single
+/// iteration, it is simply promoted; otherwise, it is normalized in the
+/// traditional way, by converting the lower bound to zero and the loop step to
+/// one. The upper bound is set to the trip count of the loop. For now, the
+/// original loop must have a lower bound with a single result only. There is
+/// no such restriction on upper bounds.
 static void normalizeAffineFor(AffineForOp op) {
   if (succeeded(promoteIfSingleIteration(op)))
     return;
 
-  // TODO: Normalize loop bounds.
+  // Check if the affine.for op is already normalized.
+  if (op.hasConstantLowerBound() && (op.getConstantLowerBound() == 0) &&
+      (op.getStep() == 1))
+    return;
+
+  // Check if the lower bound has a single result only. Loops with a max lower
+  // bound can't be normalized without additional support like
+  // affine.execute_region's. If the lower bound does not have a single result
+  // then skip this op.
+  if (op.getLowerBoundMap().getNumResults() != 1)
+    return;
+
+  Location loc = op.getLoc();
+  OpBuilder opBuilder(op);
+  int64_t origLoopStep = op.getStep();
+
+  // Calculate the upper bound for the normalized loop.
+  SmallVector<Value, 4> ubOperands;
+  AffineBound lb = op.getLowerBound();
+  AffineBound ub = op.getUpperBound();
+  ubOperands.reserve(ub.getNumOperands() + lb.getNumOperands());
+  AffineMap origLbMap = lb.getMap();
+  AffineMap origUbMap = ub.getMap();
+
+  // Add dimension operands from the upper/lower bound.
+  for (unsigned j = 0, e = origUbMap.getNumDims(); j < e; ++j)
+    ubOperands.push_back(ub.getOperand(j));
+  for (unsigned j = 0, e = origLbMap.getNumDims(); j < e; ++j)
+    ubOperands.push_back(lb.getOperand(j));
+
+  // Add symbol operands from the upper/lower bound.
+  for (unsigned j = 0, e = origUbMap.getNumSymbols(); j < e; ++j)
+    ubOperands.push_back(ub.getOperand(origUbMap.getNumDims() + j));
+  for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
+    ubOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j));
+
+  // Add the original result expressions from the lower/upper bound maps.
+  SmallVector<AffineExpr, 4> origLbExprs(origLbMap.getResults().begin(),
+                                         origLbMap.getResults().end());
+  SmallVector<AffineExpr, 4> origUbExprs(origUbMap.getResults().begin(),
+                                         origUbMap.getResults().end());
+  SmallVector<AffineExpr, 4> newUbExprs;
+
+  // The original upper bound can have more than one result. For the new upper
+  // bound of this loop, take the difference of each ub result and the lb
+  // result, and ceildiv it by the loop step. E.g.,
+  //
+  //  affine.for %i1 = 0 to min affine_map<(d0)[] -> (d0 + 32, 1024)>(%i0)
+  //
+  //  will have an upper bound map of
+  //
+  //  affine_map<(d0)[] -> (((d0 + 32) - 0) ceildiv 1, (1024 - 0) ceildiv
+  //  1)>(%i0)
+  //
+  // Insert all combinations of upper/lower bound results.
+  for (unsigned i = 0, e = origUbExprs.size(); i < e; ++i) {
+    newUbExprs.push_back(
+        (origUbExprs[i] - origLbExprs[0]).ceilDiv(origLoopStep));
+  }
+
+  // Construct newUbMap.
+  AffineMap newUbMap =
+      AffineMap::get(origLbMap.getNumDims() + origUbMap.getNumDims(),
+                     origLbMap.getNumSymbols() + origUbMap.getNumSymbols(),
+                     newUbExprs, opBuilder.getContext());
+
+  // Normalize the loop.
+  op.setUpperBound(ubOperands, newUbMap);
+  op.setLowerBound({}, opBuilder.getConstantAffineMap(0));
+  op.setStep(1);
+
+  // Calculate the Value of the new loop IV: create an affine.apply that maps
+  // the normalized IV back to the value the original IV would have taken.
+  opBuilder.setInsertionPointToStart(op.getBody());
+  SmallVector<Value, 4> lbOperands(lb.getOperands().begin(),
+                                   lb.getOperands().begin() +
+                                       lb.getMap().getNumDims());
+  // Add an extra dim operand for the loop IV.
+  lbOperands.push_back(op.getInductionVar());
+  // Add symbol operands from the lower bound.
+  for (unsigned j = 0, e = origLbMap.getNumSymbols(); j < e; ++j)
+    lbOperands.push_back(lb.getOperand(origLbMap.getNumDims() + j));
+
+  AffineExpr origIVExpr = opBuilder.getAffineDimExpr(lb.getMap().getNumDims());
+  AffineExpr newIVExpr = origIVExpr * origLoopStep + origLbMap.getResult(0);
+  AffineMap ivMap = AffineMap::get(origLbMap.getNumDims() + 1,
+                                   origLbMap.getNumSymbols(), newIVExpr);
+  Operation *newIV = opBuilder.create<AffineApplyOp>(loc, ivMap, lbOperands);
+  op.getInductionVar().replaceAllUsesExcept(
+      newIV->getResult(0), SmallPtrSet<Operation *, 1>{newIV});
 }
 
 namespace {
diff --git a/mlir/test/Dialect/Affine/affine-loop-normalize.mlir b/mlir/test/Dialect/Affine/affine-loop-normalize.mlir
index ab8a0b7..e8ad3ed 100644
--- a/mlir/test/Dialect/Affine/affine-loop-normalize.mlir
+++ b/mlir/test/Dialect/Affine/affine-loop-normalize.mlir
@@ -42,3 +42,162 @@ func @single_iteration_loop(%in: memref<1xf32>, %out: memref<1xf32>) {
 // CHECK:       affine.load
 // CHECK-NEXT:  affine.store
 // CHECK-NEXT:  return
+
+// -----
+
+// CHECK-DAG: [[$IV0:#map[0-9]+]] = affine_map<(d0) -> (d0 * 2 + 2)>
+// CHECK-DAG: [[$IV1:#map[0-9]+]] = affine_map<(d0) -> (d0 * 3)>
+
+// CHECK-LABEL: func @simple_loop_nest()
+// CHECK-NEXT:   affine.for %[[I:.*]] = 0 to 15 {
+// CHECK-NEXT:     %[[IIV:.*]] = affine.apply [[$IV0]](%[[I]])
+// CHECK-NEXT:     affine.for %[[II:.*]] = 0 to 11 {
+// CHECK-NEXT:       %[[IIIV:.*]] = affine.apply [[$IV1]](%[[II]])
+// CHECK-NEXT:       "test.foo"(%[[IIV]], %[[IIIV]])
+// CHECK-NEXT:     }
+// CHECK-NEXT:   }
+// CHECK-NEXT:   return
+// CHECK-NEXT: }
+func @simple_loop_nest() {
+  affine.for %i0 = 2 to 32 step 2 {
+    affine.for %i1 = 0 to 32 step 3 {
+      "test.foo"(%i0, %i1) : (index, index) -> ()
+    }
+  }
+  return
+}
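+// Note on the expected bounds above: for %i0, the trip count is
+// (32 - 2) ceildiv 2 = 15 and the original IV is recovered as d0 * 2 + 2;
+// for %i1, the trip count is (32 - 0) ceildiv 3 = 11 and the IV is d0 * 3.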
+
+// -----
+
+// CHECK-DAG: [[$IV00:#map[0-9]+]] = affine_map<(d0) -> (d0 * 32 + 2)>
+// CHECK-DAG: [[$IV11:#map[0-9]+]] = affine_map<(d0) -> (d0 * 2)>
+// CHECK-DAG: [[$UB00:#map[0-9]+]] = affine_map<()[s0] -> ((s0 - 2) ceildiv 32)>
+// CHECK-DAG: [[$UB11:#map[0-9]+]] = affine_map<()[s0] -> (s0 ceildiv 2)>
+
+// CHECK-LABEL: func @loop_with_unknown_upper_bound
+// CHECK-SAME: (%[[ARG0:.*]]: memref<?x?xf32>, %[[ARG1:.*]]: index)
+// CHECK-NEXT:   %{{.*}} = constant 0 : index
+// CHECK-NEXT:   %[[DIM:.*]] = dim %arg0, %c0 : memref<?x?xf32>
+// CHECK-NEXT:   affine.for %[[I:.*]] = 0 to [[$UB00]]()[%[[DIM]]] {
+// CHECK-NEXT:     %[[IIV:.*]] = affine.apply [[$IV00]](%[[I]])
+// CHECK-NEXT:     affine.for %[[II:.*]] = 0 to [[$UB11]]()[%[[ARG1]]] {
+// CHECK-NEXT:       %[[IIIV:.*]] = affine.apply [[$IV11]](%[[II]])
+// CHECK-NEXT:       "test.foo"(%[[IIV]], %[[IIIV]])
+// CHECK-NEXT:     }
+// CHECK-NEXT:   }
+// CHECK-NEXT:   return
+// CHECK-NEXT: }
+func @loop_with_unknown_upper_bound(%arg0: memref<?x?xf32>, %arg1: index) {
+  %c0 = constant 0 : index
+  %0 = dim %arg0, %c0 : memref<?x?xf32>
+  affine.for %i0 = 2 to %0 step 32 {
+    affine.for %i1 = 0 to %arg1 step 2 {
+      "test.foo"(%i0, %i1) : (index, index) -> ()
+    }
+  }
+  return
+}
+
+// -----
+
+// CHECK-DAG: [[$OUTERIV:#map[0-9]+]] = affine_map<(d0) -> (d0 * 32 + 2)>
+// CHECK-DAG: [[$INNERIV:#map[0-9]+]] = affine_map<(d0) -> (d0 + 2)>
+// CHECK-DAG: [[$OUTERUB:#map[0-9]+]] = affine_map<()[s0] -> ((s0 - 2) ceildiv 32)>
+// CHECK-DAG: [[$INNERUB:#map[0-9]+]] = affine_map<(d0) -> (d0 - 2, 510)>
+
+// CHECK-LABEL: func @loop_with_multiple_upper_bounds
+// CHECK-SAME: (%[[ARG0:.*]]: memref<?x?xf32>, %[[ARG1:.*]]: index)
+// CHECK-NEXT:   %{{.*}} = constant 0 : index
+// CHECK-NEXT:   %[[DIM:.*]] = dim %arg0, %c0 : memref<?x?xf32>
+// CHECK-NEXT:   affine.for %[[I:.*]] = 0 to [[$OUTERUB]]()[%[[DIM]]] {
+// CHECK-NEXT:     %[[IIV:.*]] = affine.apply [[$OUTERIV]](%[[I]])
+// CHECK-NEXT:     affine.for %[[II:.*]] = 0 to min [[$INNERUB]](%[[ARG1]]) {
+// CHECK-NEXT:       %[[IIIV:.*]] = affine.apply [[$INNERIV]](%[[II]])
+// CHECK-NEXT:       "test.foo"(%[[IIV]], %[[IIIV]])
+// CHECK-NEXT:     }
+// CHECK-NEXT:   }
+// CHECK-NEXT:   return
+// CHECK-NEXT: }
func @loop_with_multiple_upper_bounds(%arg0: memref<?x?xf32>, %arg1 : index) {
+  %c0 = constant 0 : index
+  %0 = dim %arg0, %c0 : memref<?x?xf32>
+  affine.for %i0 = 2 to %0 step 32 {
+    affine.for %i1 = 2 to min affine_map<(d0)[] -> (d0, 512)>(%arg1) {
+      "test.foo"(%i0, %i1) : (index, index) -> ()
+    }
+  }
+  return
+}
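+// Note on the expected inner bound above: each result of the original
+// min(d0, 512) upper bound is rewritten as (result - 2) ceildiv 1, giving
+// min(d0 - 2, 510), and the original IV is recovered as d0 + 2.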
+
+// -----
+
+// CHECK-DAG: [[$INTERUB:#map[0-9]+]] = affine_map<()[s0] -> (s0 ceildiv 32)>
+// CHECK-DAG: [[$INTERIV:#map[0-9]+]] = affine_map<(d0) -> (d0 * 32)>
+// CHECK-DAG: [[$INTRAUB:#map[0-9]+]] = affine_map<(d0, d1)[s0] -> (32, -d0 + s0)>
+// CHECK-DAG: [[$INTRAIV:#map[0-9]+]] = affine_map<(d0, d1) -> (d1 + d0)>
+
+// CHECK-LABEL: func @tiled_matmul
+// CHECK-SAME: (%[[ARG0:.*]]: memref<1024x1024xf32>, %[[ARG1:.*]]: memref<1024x1024xf32>, %[[ARG2:.*]]: memref<1024x1024xf32>)
+// CHECK-NEXT:   %{{.*}} = constant 0 : index
+// CHECK-NEXT:   %{{.*}} = constant 1 : index
+// CHECK-NEXT:   %[[DIM0:.*]] = dim %[[ARG0]], %{{.*}}
+// CHECK-NEXT:   %[[DIM1:.*]] = dim %[[ARG1]], %{{.*}}
+// CHECK-NEXT:   %[[DIM2:.*]] = dim %[[ARG0]], %{{.*}}
+// CHECK-NEXT:   affine.for %[[I:.*]] = 0 to [[$INTERUB]]()[%[[DIM0]]] {
+// CHECK-NEXT:     %[[IIV:.*]] = affine.apply [[$INTERIV]](%[[I]])
+// CHECK-NEXT:     affine.for %[[J:.*]] = 0 to [[$INTERUB]]()[%[[DIM1]]] {
+// CHECK-NEXT:       %[[JIV:.*]] = affine.apply [[$INTERIV]](%[[J]])
+// CHECK-NEXT:       affine.for %[[K:.*]] = 0 to [[$INTERUB]]()[%[[DIM2]]] {
+// CHECK-NEXT:         %[[KIV:.*]] = affine.apply [[$INTERIV]](%[[K]])
+// CHECK-NEXT:         affine.for %[[II:.*]] = 0 to min [[$INTRAUB]](%[[IIV]], %[[IIV]])[%[[DIM0]]] {
+// CHECK-NEXT:           %[[IIIV:.*]] = affine.apply [[$INTRAIV]](%[[IIV]], %[[II]])
+// CHECK-NEXT:           affine.for %[[JJ:.*]] = 0 to min [[$INTRAUB]](%[[JIV]], %[[JIV]])[%[[DIM1]]] {
+// CHECK-NEXT:             %[[JJIV:.*]] = affine.apply [[$INTRAIV]](%[[JIV]], %[[JJ]])
+// CHECK-NEXT:             affine.for %[[KK:.*]] = 0 to min [[$INTRAUB]](%[[KIV]], %[[KIV]])[%[[DIM2]]] {
+// CHECK-NEXT:               %[[KKIV:.*]] = affine.apply [[$INTRAIV]](%[[KIV]], %[[KK]])
+// CHECK-NEXT:               %{{.*}} = affine.load %[[ARG0]][%[[IIIV]], %[[KKIV]]] : memref<1024x1024xf32>
+// CHECK-NEXT:               %{{.*}} = affine.load %[[ARG1]][%[[KKIV]], %[[JJIV]]] : memref<1024x1024xf32>
+// CHECK-NEXT:               %{{.*}} = affine.load %[[ARG2]][%[[IIIV]], %[[JJIV]]] : memref<1024x1024xf32>
+// CHECK-NEXT:               %{{.*}} = mulf %{{.*}}, %{{.*}} : f32
+// CHECK-NEXT:               %{{.*}} = addf %{{.*}}, %{{.*}} : f32
+// CHECK-NEXT:               affine.store %{{.*}}, %[[ARG2]][%[[IIIV]], %[[JJIV]]] : memref<1024x1024xf32>
+// CHECK-NEXT:             }
+// CHECK-NEXT:           }
+// CHECK-NEXT:         }
+// CHECK-NEXT:       }
+// CHECK-NEXT:     }
+// CHECK-NEXT:   }
+// CHECK-NEXT:   return
+// CHECK-NEXT: }
+#map0 = affine_map<(d0, d1) -> (d0, d1)>
+#map1 = affine_map<(d0) -> (d0)>
+#map2 = affine_map<(d0)[s0] -> (d0 + 32, s0)>
+#map3 = affine_map<() -> (0)>
+#map4 = affine_map<()[s0] -> (s0)>
+
+func @tiled_matmul(%0: memref<1024x1024xf32>, %1: memref<1024x1024xf32>, %2: memref<1024x1024xf32>) {
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %3 = dim %0, %c0 : memref<1024x1024xf32>
+  %4 = dim %1, %c1 : memref<1024x1024xf32>
+  %5 = dim %0, %c1 : memref<1024x1024xf32>
+  affine.for %arg0 = 0 to %3 step 32 {
+    affine.for %arg1 = 0 to %4 step 32 {
+      affine.for %arg2 = 0 to %5 step 32 {
+        affine.for %arg3 = #map1(%arg0) to min #map2(%arg0)[%3] {
+          affine.for %arg4 = #map1(%arg1) to min #map2(%arg1)[%4] {
+            affine.for %arg5 = #map1(%arg2) to min #map2(%arg2)[%5] {
+              %6 = affine.load %0[%arg3, %arg5] : memref<1024x1024xf32>
+              %7 = affine.load %1[%arg5, %arg4] : memref<1024x1024xf32>
+              %8 = affine.load %2[%arg3, %arg4] : memref<1024x1024xf32>
+              %9 = mulf %6, %7 : f32
+              %10 = addf %8, %9 : f32
+              affine.store %10, %2[%arg3, %arg4] : memref<1024x1024xf32>
+            }
+          }
+        }
+      }
+    }
+  }
+  return
+}
--
2.7.4
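P.S. For completeness, a sketch of a loop shape this patch deliberately
skips (a hypothetical input, not from the testsuite; %n is any valid index
operand): a multi-result, i.e. 'max', lower-bound map makes
normalizeAffineFor bail out early, since such loops can't be normalized
without something like affine.execute_region, as the code comment notes.

  affine.for %i = max affine_map<(d0) -> (d0, 16)>(%n) to 128 {
    "test.foo"(%i) : (index) -> ()
  }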