From 70da33bf30dabf9d3c1cddc8d18094f76ff860bb Mon Sep 17 00:00:00 2001 From: Uday Bondhugula Date: Wed, 1 Apr 2020 12:00:26 +0530 Subject: [PATCH] [MLIR] fix/update affine data copy utility for max/min bounds Fix point-wise copy generation to work with bounds that have max/min. Change structure of copy loop nest to use absolute loop indices and subtracting base from the indexes of the fast buffers. Update supporting utilities: Fix FlatAffineConstraints::getLowerAndUpperBound to look at equalities as well and for a missing division. Update unionBoundingBox to not discard common constraints (leads to a tighter system). Update MemRefRegion::getConstantBoundingSizeAndShape to add memref dimension constraints. Run removeTrivialRedundancy at the end of MemRefRegion::compute. Run single iteration loop promotion and load/store canonicalization after affine data copy (in its test pass as well). Differential Revision: https://reviews.llvm.org/D77320 --- mlir/include/mlir/Analysis/AffineStructures.h | 14 +- mlir/include/mlir/Analysis/Utils.h | 9 +- mlir/lib/Analysis/AffineStructures.cpp | 97 ++++++++++--- mlir/lib/Analysis/Utils.cpp | 40 ++++- .../Affine/Transforms/AffineDataCopyGeneration.cpp | 17 ++- mlir/lib/Transforms/Utils/LoopUtils.cpp | 127 +++++++++------- mlir/test/Dialect/Affine/affine-data-copy.mlir | 161 ++++++++++++++------- mlir/test/Dialect/Affine/dma-generate.mlir | 60 +++----- .../test/lib/Dialect/Affine/TestAffineDataCopy.cpp | 14 +- 9 files changed, 362 insertions(+), 177 deletions(-) diff --git a/mlir/include/mlir/Analysis/AffineStructures.h b/mlir/include/mlir/Analysis/AffineStructures.h index cd31952..b8a9973 100644 --- a/mlir/include/mlir/Analysis/AffineStructures.h +++ b/mlir/include/mlir/Analysis/AffineStructures.h @@ -481,13 +481,13 @@ public: /// one; None otherwise. Optional getConstantUpperBound(unsigned pos) const; - /// Gets the lower and upper bound of the pos^th identifier treating - /// [0, offset) U [offset + num, symStartPos) as dimensions and - /// [symStartPos, getNumDimAndSymbolIds) as symbols. The returned - /// multi-dimensional maps in the pair represent the max and min of - /// potentially multiple affine expressions. The upper bound is exclusive. - /// 'localExprs' holds pre-computed AffineExpr's for all local identifiers in - /// the system. + /// Gets the lower and upper bound of the `offset` + `pos`th identifier + /// treating [0, offset) U [offset + num, symStartPos) as dimensions and + /// [symStartPos, getNumDimAndSymbolIds) as symbols, and `pos` lies in + /// [0, num). The multi-dimensional maps in the returned pair represent the + /// max and min of potentially multiple affine expressions. The upper bound is + /// exclusive. `localExprs` holds pre-computed AffineExpr's for all local + /// identifiers in the system. std::pair getLowerAndUpperBound(unsigned pos, unsigned offset, unsigned num, unsigned symStartPos, ArrayRef localExprs, diff --git a/mlir/include/mlir/Analysis/Utils.h b/mlir/include/mlir/Analysis/Utils.h index 9b69c6f..36ba49d 100644 --- a/mlir/include/mlir/Analysis/Utils.h +++ b/mlir/include/mlir/Analysis/Utils.h @@ -220,12 +220,19 @@ struct MemRefRegion { /// i.e., the returned bounding constant holds for *any given* value of the /// symbol identifiers. The 'shape' vector is set to the corresponding /// dimension-wise bounds major to minor. We use int64_t instead of uint64_t - /// since index types can be at most int64_t. + /// since index types can be at most int64_t. 
`lbs` are set to the lower + /// bounds for each of the rank dimensions, and lbDivisors contains the + /// corresponding denominators for floorDivs. Optional getConstantBoundingSizeAndShape( SmallVectorImpl *shape = nullptr, std::vector> *lbs = nullptr, SmallVectorImpl *lbDivisors = nullptr) const; + /// Gets the lower and upper bound map for the dimensional identifier at + /// `pos`. + void getLowerAndUpperBound(unsigned pos, AffineMap &lbMap, + AffineMap &ubMap) const; + /// A wrapper around FlatAffineConstraints::getConstantBoundOnDimSize(). 'pos' /// corresponds to the position of the memref shape's dimension (major to /// minor) which matches 1:1 with the dimensional identifier positions in diff --git a/mlir/lib/Analysis/AffineStructures.cpp b/mlir/lib/Analysis/AffineStructures.cpp index 947087c..f9e8bf5 100644 --- a/mlir/lib/Analysis/AffineStructures.cpp +++ b/mlir/lib/Analysis/AffineStructures.cpp @@ -1395,8 +1395,9 @@ std::pair FlatAffineConstraints::getLowerAndUpperBound( assert(getNumLocalIds() == localExprs.size() && "incorrect local exprs count"); - SmallVector lbIndices, ubIndices; - getLowerAndUpperBoundIndices(pos + offset, &lbIndices, &ubIndices); + SmallVector lbIndices, ubIndices, eqIndices; + getLowerAndUpperBoundIndices(pos + offset, &lbIndices, &ubIndices, &eqIndices, + offset, num); /// Add to 'b' from 'a' in set [0, offset) U [offset + num, symbStartPos). auto addCoeffs = [&](ArrayRef a, SmallVectorImpl &b) { @@ -1408,10 +1409,10 @@ std::pair FlatAffineConstraints::getLowerAndUpperBound( }; SmallVector lb, ub; - SmallVector exprs; + SmallVector lbExprs; unsigned dimCount = symStartPos - num; unsigned symCount = getNumDimAndSymbolIds() - symStartPos; - exprs.reserve(lbIndices.size()); + lbExprs.reserve(lbIndices.size() + eqIndices.size()); // Lower bound expressions. for (auto idx : lbIndices) { auto ineq = getInequality(idx); @@ -1422,13 +1423,14 @@ std::pair FlatAffineConstraints::getLowerAndUpperBound( std::transform(lb.begin(), lb.end(), lb.begin(), std::negate()); auto expr = getAffineExprFromFlatForm(lb, dimCount, symCount, localExprs, context); - exprs.push_back(expr); + // expr ceildiv divisor is (expr + divisor - 1) floordiv divisor + int64_t divisor = std::abs(ineq[pos + offset]); + expr = (expr + divisor - 1).floorDiv(divisor); + lbExprs.push_back(expr); } - auto lbMap = - exprs.empty() ? AffineMap() : AffineMap::get(dimCount, symCount, exprs); - exprs.clear(); - exprs.reserve(ubIndices.size()); + SmallVector ubExprs; + ubExprs.reserve(ubIndices.size() + eqIndices.size()); // Upper bound expressions. for (auto idx : ubIndices) { auto ineq = getInequality(idx); @@ -1436,11 +1438,37 @@ std::pair FlatAffineConstraints::getLowerAndUpperBound( addCoeffs(ineq, ub); auto expr = getAffineExprFromFlatForm(ub, dimCount, symCount, localExprs, context); + expr = expr.floorDiv(std::abs(ineq[pos + offset])); + // Upper bound is exclusive. + ubExprs.push_back(expr + 1); + } + + // Equalities. It's both a lower and a upper bound. + SmallVector b; + for (auto idx : eqIndices) { + auto eq = getEquality(idx); + addCoeffs(eq, b); + if (eq[pos + offset] > 0) + std::transform(b.begin(), b.end(), b.begin(), std::negate()); + + // Extract the upper bound (in terms of other coeff's + const). + auto expr = + getAffineExprFromFlatForm(b, dimCount, symCount, localExprs, context); + expr = expr.floorDiv(std::abs(eq[pos + offset])); // Upper bound is exclusive. - exprs.push_back(expr + 1); + ubExprs.push_back(expr + 1); + // Lower bound. 
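+    // For an equality c * id + rest = 0, `b` holds the flattened form of
+    // `rest` (negated above when c > 0), so that id = expr / |c|. Hence
+    // `expr ceildiv |c|` is a valid lower bound, mirroring the exclusive
+    // upper bound `expr floordiv |c| + 1` computed just above.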
+ expr = + getAffineExprFromFlatForm(b, dimCount, symCount, localExprs, context); + expr = expr.ceilDiv(std::abs(eq[pos + offset])); + lbExprs.push_back(expr); } - auto ubMap = - exprs.empty() ? AffineMap() : AffineMap::get(dimCount, symCount, exprs); + + auto lbMap = lbExprs.empty() ? AffineMap() + : AffineMap::get(dimCount, symCount, lbExprs); + + auto ubMap = ubExprs.empty() ? AffineMap() + : AffineMap::get(dimCount, symCount, ubExprs); return {lbMap, ubMap}; } @@ -1583,7 +1611,7 @@ void FlatAffineConstraints::getSliceBounds(unsigned offset, unsigned num, tmpClone->removeRedundantInequalities(); } std::tie(lbMap, ubMap) = tmpClone->getLowerAndUpperBound( - pos, offset, num, getNumDimIds(), {}, context); + pos, offset, num, getNumDimIds(), /*localExprs=*/{}, context); } // If the above fails, we'll just use the constant lower bound and the @@ -2627,6 +2655,30 @@ static BoundCmpResult compareBounds(ArrayRef a, ArrayRef b) { } } // namespace +// Returns constraints that are common to both A & B. +static void getCommonConstraints(const FlatAffineConstraints &A, + const FlatAffineConstraints &B, + FlatAffineConstraints &C) { + C.reset(A.getNumDimIds(), A.getNumSymbolIds(), A.getNumLocalIds()); + // A naive O(n^2) check should be enough here given the input sizes. + for (unsigned r = 0, e = A.getNumInequalities(); r < e; ++r) { + for (unsigned s = 0, f = B.getNumInequalities(); s < f; ++s) { + if (A.getInequality(r) == B.getInequality(s)) { + C.addInequality(A.getInequality(r)); + break; + } + } + } + for (unsigned r = 0, e = A.getNumEqualities(); r < e; ++r) { + for (unsigned s = 0, f = B.getNumEqualities(); s < f; ++s) { + if (A.getEquality(r) == B.getEquality(s)) { + C.addEquality(A.getEquality(r)); + break; + } + } + } +} + // Computes the bounding box with respect to 'other' by finding the min of the // lower bounds and the max of the upper bounds along each of the dimensions. LogicalResult @@ -2639,13 +2691,19 @@ FlatAffineConstraints::unionBoundingBox(const FlatAffineConstraints &otherCst) { assert(otherCst.getNumLocalIds() == 0 && "local ids not supported here"); assert(getNumLocalIds() == 0 && "local ids not supported yet here"); + // Align `other` to this. Optional otherCopy; if (!areIdsAligned(*this, otherCst)) { otherCopy.emplace(FlatAffineConstraints(otherCst)); mergeAndAlignIds(/*offset=*/numDims, this, &otherCopy.getValue()); } - const auto &other = otherCopy ? *otherCopy : otherCst; + const auto &otherAligned = otherCopy ? *otherCopy : otherCst; + + // Get the constraints common to both systems; these will be added as is to + // the union. + FlatAffineConstraints commonCst; + getCommonConstraints(*this, otherAligned, commonCst); std::vector> boundingLbs; std::vector> boundingUbs; @@ -2668,7 +2726,7 @@ FlatAffineConstraints::unionBoundingBox(const FlatAffineConstraints &otherCst) { // TODO(bondhugula): handle union if a dimension is unbounded. return failure(); - auto otherExtent = other.getConstantBoundOnDimSize( + auto otherExtent = otherAligned.getConstantBoundOnDimSize( d, &otherLb, &otherLbFloorDivisor, &otherUb); if (!otherExtent.hasValue() || lbFloorDivisor != otherLbFloorDivisor) // TODO(bondhugula): symbolic extents when necessary. @@ -2690,7 +2748,7 @@ FlatAffineConstraints::unionBoundingBox(const FlatAffineConstraints &otherCst) { } else { // Uncomparable - check for constant lower/upper bounds. 
auto constLb = getConstantLowerBound(d); - auto constOtherLb = other.getConstantLowerBound(d); + auto constOtherLb = otherAligned.getConstantLowerBound(d); if (!constLb.hasValue() || !constOtherLb.hasValue()) return failure(); std::fill(minLb.begin(), minLb.end(), 0); @@ -2706,7 +2764,7 @@ FlatAffineConstraints::unionBoundingBox(const FlatAffineConstraints &otherCst) { } else { // Uncomparable - check for constant lower/upper bounds. auto constUb = getConstantUpperBound(d); - auto constOtherUb = other.getConstantUpperBound(d); + auto constOtherUb = otherAligned.getConstantUpperBound(d); if (!constUb.hasValue() || !constOtherUb.hasValue()) return failure(); std::fill(maxUb.begin(), maxUb.end(), 0); @@ -2736,6 +2794,11 @@ FlatAffineConstraints::unionBoundingBox(const FlatAffineConstraints &otherCst) { addInequality(boundingLbs[d]); addInequality(boundingUbs[d]); } + + // Add the constraints that were common to both systems. + append(commonCst); + removeTrivialRedundancy(); + // TODO(mlir-team): copy over pure symbolic constraints from this and 'other' // over to the union (since the above are just the union along dimensions); we // shouldn't be discarding any other constraints on the symbols. diff --git a/mlir/lib/Analysis/Utils.cpp b/mlir/lib/Analysis/Utils.cpp index bef227e..67d0138 100644 --- a/mlir/lib/Analysis/Utils.cpp +++ b/mlir/lib/Analysis/Utils.cpp @@ -64,7 +64,6 @@ ComputationSliceState::getAsConstraints(FlatAffineConstraints *cst) { assert(cst->containsId(value) && "value expected to be present"); if (isValidSymbol(value)) { // Check if the symbol is a constant. - if (auto cOp = dyn_cast_or_null(value.getDefiningOp())) cst->setIdToConstant(value, cOp.getValue()); } else if (auto loop = getForInductionVarOwner(value)) { @@ -103,6 +102,20 @@ Optional MemRefRegion::getConstantBoundingSizeAndShape( assert(rank == cst.getNumDimIds() && "inconsistent memref region"); + // Use a copy of the region constraints that has upper/lower bounds for each + // memref dimension with static size added to guard against potential + // over-approximation from projection or union bounding box. We may not add + // this on the region itself since they might just be redundant constraints + // that will need non-trivials means to eliminate. + FlatAffineConstraints cstWithShapeBounds(cst); + for (unsigned r = 0; r < rank; r++) { + cstWithShapeBounds.addConstantLowerBound(r, 0); + int64_t dimSize = memRefType.getDimSize(r); + if (ShapedType::isDynamic(dimSize)) + continue; + cstWithShapeBounds.addConstantUpperBound(r, dimSize - 1); + } + // Find a constant upper bound on the extent of this memref region along each // dimension. int64_t numElements = 1; @@ -110,7 +123,8 @@ Optional MemRefRegion::getConstantBoundingSizeAndShape( int64_t lbDivisor; for (unsigned d = 0; d < rank; d++) { SmallVector lb; - Optional diff = cst.getConstantBoundOnDimSize(d, &lb, &lbDivisor); + Optional diff = + cstWithShapeBounds.getConstantBoundOnDimSize(d, &lb, &lbDivisor); if (diff.hasValue()) { diffConstant = diff.getValue(); assert(lbDivisor > 0); @@ -122,7 +136,7 @@ Optional MemRefRegion::getConstantBoundingSizeAndShape( return None; diffConstant = dimSize; // Lower bound becomes 0. 
- lb.resize(cst.getNumSymbolIds() + 1, 0); + lb.resize(cstWithShapeBounds.getNumSymbolIds() + 1, 0); lbDivisor = 1; } numElements *= diffConstant; @@ -138,6 +152,25 @@ Optional MemRefRegion::getConstantBoundingSizeAndShape( return numElements; } +void MemRefRegion::getLowerAndUpperBound(unsigned pos, AffineMap &lbMap, + AffineMap &ubMap) const { + assert(pos < cst.getNumDimIds() && "invalid position"); + auto memRefType = memref.getType().cast(); + unsigned rank = memRefType.getRank(); + + assert(rank == cst.getNumDimIds() && "inconsistent memref region"); + + auto boundPairs = cst.getLowerAndUpperBound( + pos, /*offset=*/0, /*num=*/rank, cst.getNumDimAndSymbolIds(), + /*localExprs=*/{}, memRefType.getContext()); + lbMap = boundPairs.first; + ubMap = boundPairs.second; + assert(lbMap && "lower bound for a region must exist"); + assert(ubMap && "upper bound for a region must exist"); + assert(lbMap.getNumInputs() == cst.getNumDimAndSymbolIds() - rank); + assert(ubMap.getNumInputs() == cst.getNumDimAndSymbolIds() - rank); +} + LogicalResult MemRefRegion::unionBoundingBox(const MemRefRegion &other) { assert(memref == other.memref); return cst.unionBoundingBox(*other.getConstraints()); @@ -304,6 +337,7 @@ LogicalResult MemRefRegion::compute(Operation *op, unsigned loopDepth, cst.addConstantUpperBound(r, dimSize - 1); } } + cst.removeTrivialRedundancy(); LLVM_DEBUG(llvm::dbgs() << "Memory region:\n"); LLVM_DEBUG(cst.dump()); diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp index 3101a2e..3ba61bc 100644 --- a/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp @@ -21,8 +21,9 @@ #include "mlir/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/Dialect/Affine/Passes.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/PatternMatch.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/LoopUtils.h" #include "llvm/ADT/MapVector.h" @@ -264,7 +265,19 @@ void AffineDataCopyGeneration::runOnFunction() { runOnBlock(&block, copyNests); // Promote any single iteration loops in the copy nests. - for (auto nest : copyNests) { + for (auto nest : copyNests) nest->walk([](AffineForOp forOp) { promoteIfSingleIteration(forOp); }); + + // Promoting single iteration loops could lead to simplification of + // load's/store's. We will run canonicalization patterns on load/stores. + // TODO: this whole function load/store canonicalization should be replaced by + // canonicalization that is limited to only the load/store ops + // introduced/touched by this pass (those inside 'copyNests'). This would be + // possible once the necessary support is available in the pattern rewriter. 
+ if (!copyNests.empty()) { + OwningRewritePatternList patterns; + AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext()); + AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext()); + applyPatternsGreedily(f, std::move(patterns)); } } diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp index f07d0bb..0c05383 100644 --- a/mlir/lib/Transforms/Utils/LoopUtils.cpp +++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp @@ -1458,58 +1458,79 @@ static void getMultiLevelStrides(const MemRefRegion ®ion, } /// Generates a point-wise copy from/to `memref' to/from `fastMemRef' and -/// returns the outermost AffineForOp of the copy loop nest. `memIndicesStart' -/// holds the lower coordinates of the region in the original memref to copy -/// in/out. If `copyOut' is true, generates a copy-out; otherwise a copy-in. -static AffineForOp generatePointWiseCopy(Location loc, Value memref, - Value fastMemRef, - AffineMap memAffineMap, - ArrayRef memIndicesStart, - ArrayRef fastBufferShape, - bool isCopyOut, OpBuilder b) { - assert(!memIndicesStart.empty() && "only 1-d or more memrefs"); - - // The copy-in nest is generated as follows as an example for a 2-d region: - // for x = ... - // for y = ... - // fast_buf[x][y] = buf[mem_x + x][mem_y + y] - - SmallVector fastBufIndices, memIndices; +/// returns the outermost AffineForOp of the copy loop nest. `lbMaps` and +/// `ubMaps` along with `lbOperands` and `ubOperands` hold the lower and upper +/// bound information for the copy loop nest. `fastBufOffsets` contain the +/// expressions to be subtracted out from the respective copy loop iterators in +/// order to index the fast buffer. If `copyOut' is true, generates a copy-out; +/// otherwise a copy-in. Builder `b` should be set to the point the copy nest is +/// inserted. +// +/// The copy-in nest is generated as follows as an example for a 2-d region: +/// for x = ... +/// for y = ... +/// fast_buf[x - offset_x][y - offset_y] = memref[x][y] +/// +static AffineForOp +generatePointWiseCopy(Location loc, Value memref, Value fastMemRef, + ArrayRef lbMaps, ArrayRef lbOperands, + ArrayRef ubMaps, ArrayRef ubOperands, + ArrayRef fastBufOffsets, bool isCopyOut, + OpBuilder b) { + assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) { + return lbMap.getNumInputs() == lbOperands.size(); + })); + assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) { + return ubMap.getNumInputs() == ubOperands.size(); + })); + + unsigned rank = memref.getType().cast().getRank(); + assert(lbMaps.size() == rank && "wrong number of lb maps"); + assert(ubMaps.size() == rank && "wrong number of ub maps"); + + SmallVector memIndices; + SmallVector fastBufExprs; + SmallVector fastBufMapOperands; AffineForOp copyNestRoot; - for (unsigned d = 0, e = fastBufferShape.size(); d < e; ++d) { - auto forOp = b.create(loc, 0, fastBufferShape[d]); + for (unsigned d = 0; d < rank; ++d) { + auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d], + ubOperands, ubMaps[d]); if (d == 0) copyNestRoot = forOp; + b = forOp.getBodyBuilder(); - fastBufIndices.push_back(forOp.getInductionVar()); - - Value memBase = - (memAffineMap == b.getMultiDimIdentityMap(memAffineMap.getNumDims())) - ? memIndicesStart[d] - : b.create( - loc, - AffineMap::get(memAffineMap.getNumDims(), - memAffineMap.getNumSymbols(), - memAffineMap.getResult(d)), - memIndicesStart); - - // Construct the subscript for the slow memref being copied. 
- auto memIndex = b.create( - loc, - AffineMap::get(2, 0, b.getAffineDimExpr(0) + b.getAffineDimExpr(1)), - ValueRange({memBase, forOp.getInductionVar()})); - memIndices.push_back(memIndex); + + auto fastBufOffsetMap = + AffineMap::get(lbOperands.size(), 0, {fastBufOffsets[d]}); + auto offset = b.create(loc, fastBufOffsetMap, lbOperands); + + // Construct the subscript for the fast memref being copied into/from: + // x - offset_x. + fastBufExprs.push_back(b.getAffineDimExpr(2 * d + 1) - + b.getAffineDimExpr(2 * d)); + fastBufMapOperands.push_back(offset); + fastBufMapOperands.push_back(forOp.getInductionVar()); + + // Subscript for the slow memref being copied. + memIndices.push_back(forOp.getInductionVar()); } + auto fastBufMap = AffineMap::get(2 * rank, /*symbolCount=*/0, fastBufExprs); + fullyComposeAffineMapAndOperands(&fastBufMap, &fastBufMapOperands); + fastBufMap = simplifyAffineMap(fastBufMap); + canonicalizeMapAndOperands(&fastBufMap, &fastBufMapOperands); + if (!isCopyOut) { // Copy in. auto load = b.create(loc, memref, memIndices); - b.create(loc, load, fastMemRef, fastBufIndices); + b.create(loc, load, fastMemRef, fastBufMap, + fastBufMapOperands); return copyNestRoot; } // Copy out. - auto load = b.create(loc, fastMemRef, fastBufIndices); + auto load = + b.create(loc, fastMemRef, fastBufMap, fastBufMapOperands); b.create(loc, load, memref, memIndices); return copyNestRoot; } @@ -1600,6 +1621,10 @@ static LogicalResult generateCopy( return success(); } + SmallVector lbMaps(rank), ubMaps(rank); + for (unsigned i = 0; i < rank; ++i) + region.getLowerAndUpperBound(i, lbMaps[i], ubMaps[i]); + const FlatAffineConstraints *cst = region.getConstraints(); // 'regionSymbols' hold values that this memory region is symbolic/parametric // on; these typically include loop IVs surrounding the level at which the @@ -1613,15 +1638,14 @@ static LogicalResult generateCopy( // along the corresponding dimension. // Index start offsets for faster memory buffer relative to the original. - SmallVector offsets; - offsets.reserve(rank); + SmallVector fastBufOffsets; + fastBufOffsets.reserve(rank); for (unsigned d = 0; d < rank; d++) { assert(lbs[d].size() == cst->getNumCols() - rank && "incorrect bound size"); AffineExpr offset = top.getAffineConstantExpr(0); - for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++) { + for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++) offset = offset + lbs[d][j] * top.getAffineDimExpr(j); - } assert(lbDivisors[d] > 0); offset = (offset + lbs[d][cst->getNumCols() - 1 - rank]).floorDiv(lbDivisors[d]); @@ -1648,7 +1672,7 @@ static LogicalResult generateCopy( // Record the offsets since they are needed to remap the memory accesses of // the original memref further below. - offsets.push_back(offset); + fastBufOffsets.push_back(offset); } // The faster memory space buffer. @@ -1716,9 +1740,11 @@ static LogicalResult generateCopy( if (!copyOptions.generateDma) { // Point-wise copy generation. - auto copyNest = generatePointWiseCopy(loc, memref, fastMemRef, memAffineMap, - memIndices, fastBufferShape, - /*isCopyOut=*/region.isWrite(), b); + auto copyNest = + generatePointWiseCopy(loc, memref, fastMemRef, lbMaps, + /*lbOperands=*/regionSymbols, ubMaps, + /*ubOperands=*/regionSymbols, fastBufOffsets, + /*isCopyOut=*/region.isWrite(), b); // Record this so that we can skip it from yet another copy. 
copyNests.insert(copyNest); @@ -1790,7 +1816,7 @@ static LogicalResult generateCopy( // which the memref region is parametric); then those corresponding to // the memref's original indices follow. auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i); - remapExprs.push_back(dimExpr - offsets[i]); + remapExprs.push_back(dimExpr - fastBufOffsets[i]); } auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs); @@ -1925,7 +1951,8 @@ uint64_t mlir::affineDataCopyGenerate(Block::iterator begin, // Compute the MemRefRegion accessed. auto region = std::make_unique(opInst->getLoc()); - if (failed(region->compute(opInst, copyDepth))) { + if (failed(region->compute(opInst, copyDepth, /*sliceState=*/nullptr, + /*addMemRefDimBounds=*/false))) { LLVM_DEBUG(llvm::dbgs() << "Error obtaining memory region: semi-affine maps?\n"); LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n"); @@ -2051,7 +2078,7 @@ uint64_t mlir::affineDataCopyGenerate(Block::iterator begin, if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) { StringRef str = "Total size of all copy buffers' for this block " "exceeds fast memory capacity\n"; - block->getParentOp()->emitError(str); + block->getParentOp()->emitWarning(str); } return totalCopyBuffersSizeInBytes; diff --git a/mlir/test/Dialect/Affine/affine-data-copy.mlir b/mlir/test/Dialect/Affine/affine-data-copy.mlir index 48e7908a..de8889e 100644 --- a/mlir/test/Dialect/Affine/affine-data-copy.mlir +++ b/mlir/test/Dialect/Affine/affine-data-copy.mlir @@ -17,12 +17,9 @@ #id = affine_map<(d0) -> (d0)> #ub = affine_map<(d0) -> (d0 + 128)> -// Map used to index the original memref while copying. -// CHECK-DAG: [[MEM_IDX_MAP:map[0-9]+]] = affine_map<(d0, d1) -> (d0 + d1)> // Map used to index the buffer while computing. // CHECK-DAG: [[MAP_IDENTITY:map[0-9]+]] = affine_map<(d0) -> (d0)> // CHECK-DAG: [[MAP_PLUS_128:map[0-9]+]] = affine_map<(d0) -> (d0 + 128)> -// CHECK-DAG: [[BUF_IDX_MAP:map[0-9]+]] = affine_map<(d0, d1, d2, d3) -> (-d0 + d2, -d1 + d3)> // CHECK-LABEL: func @matmul // FILTER-LABEL: func @matmul @@ -50,41 +47,34 @@ func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memref<40 // Buffers of size 128x128 get created here for all three matrices. -// CHECK: affine.for %{{.*}} = 0 to 4096 step 128 { -// CHECK: affine.for %{{.*}} = 0 to 4096 step 128 { +// CHECK: affine.for %[[I:.*]] = 0 to 4096 step 128 { +// CHECK: affine.for %[[J:.*]] = 0 to 4096 step 128 { // CHECK: [[BUFC:%[0-9]+]] = alloc() : memref<128x128xf32> - // The result matrix's copy gets hoisted out. // Result matrix copy-in. -// CHECK: affine.for %{{.*}} = 0 to 128 { -// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}}) -// CHECK: affine.for %{{.*}} = 0 to 128 { -// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}}) +// CHECK: affine.for %[[II:.*]] = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) { +// CHECK: affine.for %[[JJ:.*]] = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) { // CHECK: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32> -// CHECK: affine.store %{{.*}}, [[BUFC]][%{{.*}}, %{{.*}}] : memref<128x128xf32> +// CHECK: affine.store %{{.*}}, [[BUFC]][-%[[I]] + %[[II]], -%[[J]] + %[[JJ]]] : memref<128x128xf32> // CHECK: } // CHECK: } // LHS matrix copy-in. 
-// CHECK: affine.for %{{.*}} = 0 to 4096 step 128 { +// CHECK: affine.for %[[K:.*]] = 0 to 4096 step 128 { // CHECK: [[BUFA:%[0-9]+]] = alloc() : memref<128x128xf32> -// CHECK: affine.for %{{.*}} = 0 to 128 { -// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}}) -// CHECK: affine.for %{{.*}} = 0 to 128 { -// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}}) +// CHECK: affine.for %[[II:.*]] = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) { +// CHECK: affine.for %[[KK:.*]] = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) { // CHECK: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32> -// CHECK: affine.store %{{.*}}, [[BUFA]][%{{.*}}, %{{.*}}] : memref<128x128xf32> +// CHECK: affine.store %{{.*}}, [[BUFA]][-%[[I]] + %[[II]], -%[[K]] + %[[KK]]] : memref<128x128xf32> // CHECK: } // CHECK: } // RHS matrix copy-in. // CHECK: [[BUFB:%[0-9]+]] = alloc() : memref<128x128xf32> -// CHECK: affine.for %{{.*}} = 0 to 128 { -// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}}) -// CHECK: affine.for %{{.*}} = 0 to 128 { -// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}}) +// CHECK: affine.for %[[KK:.*]] = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) { +// CHECK: affine.for %[[JJ:.*]] = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) { // CHECK: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32> -// CHECK: affine.store %{{.*}}, [[BUFB]][%{{.*}}, %{{.*}}] : memref<128x128xf32> +// CHECK: affine.store %{{.*}}, [[BUFB]][-%[[K]] + %[[KK]], -%[[J]] + %[[JJ]]] : memref<128x128xf32> // CHECK: } // CHECK: } @@ -104,16 +94,12 @@ func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memref<40 // CHECK: dealloc [[BUFB]] : memref<128x128xf32> // CHECK: dealloc [[BUFA]] : memref<128x128xf32> // CHECK: } -// CHECK: affine.apply #map{{.*}}(%{{.*}}, %{{.*}}) -// CHECK: affine.apply #map{{.*}}(%{{.*}}, %{{.*}}) // Result matrix copy out. 
-// CHECK: affine.for %{{.*}} = 0 to 128 { -// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}}) -// CHECK: affine.for %{{.*}} = 0 to 128 { -// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}}) -// CHECK: [[BUFA]] = affine.load [[BUFC]][%{{.*}}, %{{.*}}] : memref<128x128xf32> -// CHECK: store [[BUFA]], %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32> +// CHECK: affine.for %{{.*}} = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) { +// CHECK: affine.for %{{.*}} = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) { +// CHECK: affine.load [[BUFC]][-%{{.*}} + %{{.*}}, -%{{.*}} + %{{.*}}] : memref<128x128xf32> +// CHECK: store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32> // CHECK: } // CHECK: } // CHECK: dealloc [[BUFC]] : memref<128x128xf32> @@ -125,15 +111,15 @@ func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memref<40 // FILTER: affine.for %{{.*}} = 0 to 4096 step 128 { // FILTER: alloc() : memref<128x4096xf32> // FILTER-NOT: alloc() -// FILTER: affine.for %{{.*}} = 0 to 128 { +// FILTER: affine.for // FILTER: affine.for %{{.*}} = 0 to 4096 { // FILTER: affine.for %{{.*}} = 0 to 4096 step 128 { // FILTER-NEXT: affine.for %{{.*}} = 0 to 4096 step 128 { // FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) { // FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) { // FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) { -// FILTER: dealloc %1 : memref<128x4096xf32> -// FILTER-NOT: dealloc %1 : memref<128x4096xf32> +// FILTER: dealloc %{{.*}} : memref<128x4096xf32> +// FILTER-NOT: dealloc %{{.*}} : memref<128x4096xf32> // ----- @@ -141,10 +127,10 @@ func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memref<40 // This test case will lead to single element buffers. These are eventually // expected to be turned into registers via alloca and mem2reg. 
// -// CHECK-SMALL-LABEL: func @foo -// FILTER-LABEL: func @foo -// MEMREF_REGION-LABEL: func @foo -func @foo(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>) -> memref<1024x1024xf32> { +// CHECK-SMALL-LABEL: func @single_elt_buffers +// FILTER-LABEL: func @single_elt_buffers +// MEMREF_REGION-LABEL: func @single_elt_buffers +func @single_elt_buffers(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>) -> memref<1024x1024xf32> { affine.for %i = 0 to 1024 { affine.for %j = 0 to 1024 { affine.for %k = 0 to 1024 { @@ -159,32 +145,20 @@ func @foo(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: mem } // CHECK-SMALL: affine.for %arg{{.*}} = 0 to 1024 { // CHECK-SMALL: affine.for %arg{{.*}} = 0 to 1024 { -// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %arg{{.*}}) -// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %arg{{.*}}) // CHECK-SMALL: alloc() : memref<1x1xf32> -// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %c0{{.*}}) -// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %c0{{.*}}) // CHECK-SMALL: affine.load %arg{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32> -// CHECK-SMALL: affine.store %{{.*}}, %{{.*}}[%c0{{.*}}, %c0{{.*}}] : memref<1x1xf32> +// CHECK-SMALL: affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32> // CHECK-SMALL: affine.for %arg{{.*}} = 0 to 1024 { -// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %arg{{.*}}) -// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %arg{{.*}}) // CHECK-SMALL: alloc() : memref<1x1xf32> -// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %c0{{.*}}) -// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %c0{{.*}}) // CHECK-SMALL: affine.load %arg{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32> -// CHECK-SMALL: affine.store %{{.*}}, %{{.*}}[%c0{{.*}}, %c0{{.*}}] : memref<1x1xf32> +// CHECK-SMALL: affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32> // CHECK-SMALL: affine.load %{{.*}}[0, 0] : memref<1x1xf32> // CHECK-SMALL: affine.load %{{.*}}[0, 0] : memref<1x1xf32> // CHECK-SMALL: addf %{{.*}}, %{{.*}} : f32 // CHECK-SMALL: affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32> // CHECK-SMALL: dealloc %{{.*}} : memref<1x1xf32> // CHECK-SMALL: } -// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %arg{{.*}}) -// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %arg{{.*}}) -// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %c0{{.*}}) -// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %c0{{.*}}) -// CHECK-SMALL: affine.load %{{.*}}[%c0{{.*}}, %c0{{.*}}] : memref<1x1xf32> +// CHECK-SMALL: affine.load %{{.*}}[0, 0] : memref<1x1xf32> // CHECK-SMALL: affine.store %{{.*}}, %arg{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32> // CHECK-SMALL: dealloc %{{.*}} : memref<1x1xf32> // CHECK-SMALL: } @@ -210,8 +184,89 @@ func @foo(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: mem // MEMREF_REGION-NOT: alloc() // MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 { // MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 { -// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 { +// MEMREF_REGION: } +// MEMREF_REGION: } +// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 { // MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 { // MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 { // MEMREF_REGION: dealloc %{{.*}} : memref<1024x1024xf32> // MEMREF_REGION-NOT: dealloc +// MEMREF_REGION-NEXT: return + +// ----- + +// This pattern typically appears with tiling with tile sizes that don't divide +// the loop trip counts. 
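+// For the tile starting at %i, the accessed region is [%i, min(4096, %i + 100)):
+// the fast buffer is sized to the maximum tile extent (100 elements), the copy
+// loops keep the original min upper bound, and the buffer subscript subtracts
+// the tile's lower bound %i (see the CHECKs below).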
+ +#map_ub = affine_map<(d0) -> (4096, d0 + 100)> + +// CHECK-DAG: [[MAP_IDENTITY:map[0-9]+]] = affine_map<(d0) -> (d0)> +// CHECK-DAG: [[MAP_MIN_UB1:map[0-9]+]] = affine_map<(d0) -> (d0 + 100, 4096)> +// CHECK-DAG: [[MAP_MIN_UB2:map[0-9]+]] = affine_map<(d0) -> (4096, d0 + 100)> + +// CHECK-LABEL: func @min_upper_bound +func @min_upper_bound(%A: memref<4096xf32>) -> memref<4096xf32> { + affine.for %i = 0 to 4096 step 100 { + affine.for %ii = affine_map<(d0) -> (d0)>(%i) to min #map_ub(%i) { + %5 = affine.load %A[%ii] : memref<4096xf32> + %6 = mulf %5, %5 : f32 + affine.store %6, %A[%ii] : memref<4096xf32> + } + } + return %A : memref<4096xf32> +} +// CHECK: affine.for %[[IV1:.*]] = 0 to 4096 step 100 +// CHECK-NEXT: %[[BUF:.*]] = alloc() : memref<100xf32> +// CHECK-NEXT: affine.for %[[IV2:.*]] = #[[MAP_IDENTITY]](%[[IV1]]) to min #[[MAP_MIN_UB1]](%[[IV1]]) { +// CHECK-NEXT: affine.load %{{.*}}[%[[IV2]]] : memref<4096xf32> +// CHECK-NEXT: affine.store %{{.*}}, %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32> +// CHECK-NEXT: } +// CHECK-NEXT: affine.for %[[IV2:.*]] = #[[MAP_IDENTITY]](%[[IV1]]) to min #[[MAP_MIN_UB2]](%[[IV1]]) { +// CHECK-NEXT: affine.load %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32> +// CHECK-NEXT: mulf +// CHECK-NEXT: affine.store %{{.*}}, %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32> +// CHECK-NEXT: } +// CHECK-NEXT: affine.for %[[IV2:.*]] = #[[MAP_IDENTITY]](%[[IV1]]) to min #[[MAP_MIN_UB1]](%[[IV1]]) { +// CHECK-NEXT: affine.load %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32> +// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%[[IV2]]] : memref<4096xf32> +// CHECK-NEXT: } +// CHECK-NEXT: dealloc %[[BUF]] : memref<100xf32> +// CHECK-NEXT: } + +// ----- + +// Lower bound is a max; upper bound is a min. This pattern typically appears +// with multi-level tiling when the tile sizes used don't divide loop trip +// counts. 
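+// The %jj window is at most 6 elements wide (the tighter of the two lb/ub
+// pairs), so the fast buffer is 2048x6: %ii is copied in full, while the
+// second subscript subtracts the lower bound expression symbol(%j) * 6
+// (see the CHECKs below).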
+ +#lb = affine_map<(d0, d1) -> (d0 * 512, d1 * 6)> +#ub = affine_map<(d0, d1) -> (d0 * 512 + 512, d1 * 6 + 6)> + +// CHECK-DAG: #[[LB:.*]] = affine_map<()[s0, s1] -> (s0 * 512, s1 * 6)> +// CHECK-DAG: #[[UB:.*]] = affine_map<()[s0, s1] -> (s0 * 512 + 512, s1 * 6 + 6)> + +// CHECK-LABEL: max_lower_bound(%{{.*}}: memref<2048x516xf64>, +// CHECK-SAME: [[i:arg[0-9]+]] +// CHECK-SAME: [[j:arg[0-9]+]] +func @max_lower_bound(%M: memref<2048x516xf64>, %i : index, %j : index) { + affine.for %ii = 0 to 2048 { + affine.for %jj = max #lb(%i, %j) to min #ub(%i, %j) { + affine.load %M[%ii, %jj] : memref<2048x516xf64> + } + } + return +} + +// CHECK: %[[BUF=.*]] = alloc() : memref<2048x6xf64> +// CHECK-NEXT: affine.for %[[ii:.*]] = 0 to 2048 { +// CHECK-NEXT: affine.for %[[jj:.*]] = max #[[LB]]()[%[[i]], %[[j]]] to min #[[UB]]()[%[[i]], %[[j]]] { +// CHECK-NEXT: affine.load %{{.*}}[%[[ii]], %[[jj]]] : memref<2048x516xf64> +// CHECK-NEXT: affine.store %{{.*}}, %[[BUF]][%[[ii]], %[[jj]] - symbol(%[[j]]) * 6] : memref<2048x6xf64> +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: affine.for %[[ii_:.*]] = 0 to 2048 { +// CHECK-NEXT: affine.for %[[jj_:.*]] = max #[[LB]]()[%{{.*}}, %{{.*}}] to min #[[UB]]()[%{{.*}}, %{{.*}}] { +// CHECK-NEXT: affine.load %[[BUF]][%[[ii_]], %[[jj_]] - symbol(%[[j]]) * 6] : memref<2048x6xf64> +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: dealloc %[[BUF]] : memref<2048x6xf64> diff --git a/mlir/test/Dialect/Affine/dma-generate.mlir b/mlir/test/Dialect/Affine/dma-generate.mlir index 3b90cc6..6afbb16 100644 --- a/mlir/test/Dialect/Affine/dma-generate.mlir +++ b/mlir/test/Dialect/Affine/dma-generate.mlir @@ -13,7 +13,6 @@ // ----- // Index of the buffer for the second DMA is remapped. -// CHECK-DAG: [[MAP_PLUS_256:#map[0-9]+]] = affine_map<(d0) -> (d0 + 256)> // CHECK-DAG: [[MAP0:#map[0-9]+]] = affine_map<(d0) -> (d0)> // CHECK-LABEL: func @loop_nest_1d() { @@ -36,14 +35,13 @@ func @loop_nest_1d() { // Second DMA transfer. // CHECK: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<512xf32>, memref<256xf32, 2>, memref<1xi32> // CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32> - // CHECK: affine.for %{{.*}} = 0 to 256 { + // CHECK: affine.for %[[IV:.*]] = 0 to 256 { // CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<256xf32, 2> - // CHECK: affine.apply [[MAP_PLUS_256]](%{{.*}}) - // Buffer for '%{{.*}}' in faster memref space is smaller size: 256xf32 - // Affine map for 'affine.load %{{.*}}' is composed: %{{.*}} + 256 - 256 = %{{.*}}. - // CHECK-NEXT: %{{.*}} = affine.load %{{.*}}[%{{.*}}] : memref<256xf32, 2> + // Buffer for '%{{.*}}' in faster memref space is of smaller size: 256xf32 + // Affine map for load on B is composed and becomes identity. + // CHECK: affine.load %{{.*}}[%[[IV]]] : memref<256xf32, 2> // Already in faster memory space. 
- // CHECK: affine.load %{{.*}}[%{{.*}}] : memref<256xf32, 2> + // CHECK: affine.load %{{.*}}[%[[IV]]] : memref<256xf32, 2> // CHECK-NEXT: } // CHECK-NEXT: dealloc %{{.*}} : memref<1xi32> // CHECK-NEXT: dealloc %{{.*}} : memref<256xf32, 2> @@ -83,19 +81,16 @@ func @loop_nest_1d() { // CHECK-NEXT: affine.for %{{.*}} = 0 to 32 { // CHECK-NEXT: affine.for %{{.*}} = 0 to 32 { // CHECK-NEXT: affine.for %{{.*}} = 0 to 16 { -// CHECK-NEXT: affine.apply #map{{[0-9]+}}(%{{.*}}, %{{.*}}) -// CHECK-NEXT: %{{.*}} = affine.load [[BUFB]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2> +// CHECK: affine.load [[BUFB]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2> // CHECK-NEXT: "foo"(%{{.*}}) : (f32) -> () // CHECK-NEXT: } // CHECK-NEXT: affine.for %{{.*}} = 0 to 16 { -// CHECK-NEXT: affine.apply #map{{[0-9]+}}(%{{.*}}, %{{.*}}) -// CHECK-NEXT: affine.load [[BUFA]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2> +// CHECK: affine.load [[BUFA]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2> // CHECK-NEXT: "bar"(%{{.*}}) : (f32) -> () // CHECK-NEXT: } // CHECK-NEXT: affine.for %{{.*}} = 0 to 16 { // CHECK-NEXT: "abc_compute"() : () -> f32 -// CHECK-NEXT: affine.apply #map{{[0-9]+}}(%{{.*}}, %{{.*}}) -// CHECK-NEXT: affine.load [[BUFC]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2> +// CHECK: affine.load [[BUFC]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2> // CHECK-NEXT: "addf32"(%{{.*}}, %{{.*}}) : (f32, f32) -> f32 // CHECK-NEXT: affine.store %{{.*}}, [[BUFC]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2> // CHECK-NEXT: } @@ -155,8 +150,7 @@ func @loop_nest_high_d(%A: memref<512 x 32 x f32>, // CHECK-LABEL: func @loop_nest_modulo() { // CHECK: alloc() : memref<256x8xf32> // CHECK-NEXT: affine.for %{{.*}} = 0 to 32 step 4 { -// CHECK-NEXT: affine.apply #map{{[0-9]+}}(%{{.*}}) -// CHECK-NEXT: alloc() : memref<1x2xf32, 2> +// CHECK: alloc() : memref<1x2xf32, 2> // CHECK-NEXT: alloc() : memref<1xi32> // Composition of the affine map for '%{{.*}}' causes '%{{.*}}' to be added as a symbol. 
// CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}, 0], %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<256x8xf32>, memref<1x2xf32, 2>, memref<1xi32> @@ -231,8 +225,6 @@ func @dma_constant_dim_access(%A : memref<100x100xf32>) { // ----- -// CHECK-DAG: [[MAP_SYM_SHIFT:#map[0-9]+]] = affine_map<(d0, d1)[s0, s1] -> (d1 + s0 + s1)> - // CHECK-LABEL: func @dma_with_symbolic_accesses func @dma_with_symbolic_accesses(%A : memref<100x100xf32>, %M : index) { %N = constant 9 : index @@ -247,10 +239,9 @@ func @dma_with_symbolic_accesses(%A : memref<100x100xf32>, %M : index) { // CHECK-NEXT: alloc() : memref<1xi32> // CHECK-NEXT: affine.dma_start %{{.*}}[0, symbol(%{{.*}}) + 9], %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} // CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} -// CHECK-NEXT: affine.for %{{.*}} = 0 to 100 { -// CHECK-NEXT: affine.for %{{.*}} = 0 to 100 { -// CHECK-NEXT: affine.apply [[MAP_SYM_SHIFT]](%{{.*}}, %{{.*}})[%{{.*}}, %{{.*}}] -// CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<100x100xf32, 2> +// CHECK-NEXT: affine.for %[[IV0:.*]] = 0 to 100 { +// CHECK-NEXT: affine.for %[[IV1:.*]] = 0 to 100 { +// CHECK: affine.load %{{.*}}[%[[IV0]], %[[IV1]]] : memref<100x100xf32, 2> // CHECK-NEXT: } // CHECK-NEXT: } // CHECK: return @@ -317,11 +308,6 @@ func @dma_memref_3d(%arg0: memref<1024x1024x1024xf32>) { // ----- -// CHECK-DAG: [[MAP_PLUS_64:#map[0-9]+]] = affine_map<(d0) -> (d0 + 64)> -// CHECK-DAG: [[MAP_PLUS_128:#map[0-9]+]] = affine_map<(d0) -> (d0 + 128)> -// CHECK-DAG: [[MAP_PLUS_2:#map[0-9]+]] = affine_map<(d0) -> (d0 + 2)> -// CHECK-DAG: [[MAP_PLUS_192:#map[0-9]+]] = affine_map<(d0) -> (d0 + 192)> - // The first load accesses ([2,258), [128,384)) // The second load accesses ([64,320), [2,258)) // The first store writes to ([2,258), [192,448)) @@ -359,15 +345,9 @@ func @multi_load_store_union() { // CHECK-NEXT: alloc() : memref<1xi32> // CHECK-NEXT: affine.for %{{.*}} = 0 to 256 { // CHECK-NEXT: affine.for %{{.*}} = 0 to 256 { -// CHECK-NEXT: affine.apply [[MAP_PLUS_64]](%{{.*}}) -// CHECK-NEXT: affine.apply [[MAP_PLUS_128]](%{{.*}}) -// CHECK-NEXT: affine.apply [[MAP_PLUS_2]](%{{.*}}) -// CHECK-NEXT: affine.apply [[MAP_PLUS_2]](%{{.*}}) -// CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}} + 126] : memref<382x446xf32, 2> +// CHECK: affine.load %{{.*}}[%{{.*}}, %{{.*}} + 126] : memref<382x446xf32, 2> // CHECK-NEXT: affine.load %{{.*}}[%{{.*}} + 62, %{{.*}}] : memref<382x446xf32, 2> -// CHECK-NEXT: affine.apply [[MAP_PLUS_128]](%{{.*}}) -// CHECK-NEXT: affine.apply [[MAP_PLUS_192]](%{{.*}}) -// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}} + 190] : memref<382x446xf32, 2> +// CHECK: affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}} + 190] : memref<382x446xf32, 2> // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}} + 126, %{{.*}}] : memref<382x446xf32, 2> // CHECK-NEXT: } // CHECK-NEXT: } @@ -468,9 +448,8 @@ func @relative_loop_bounds(%arg0: memref<1027xf32>) { // CHECK: [[BUF:%[0-9]+]] = alloc() : memref<1027xf32, 2> // CHECK-NEXT: [[MEM:%[0-9]+]] = alloc() : memref<1xi32> // CHECK-NEXT: affine.for %{{.*}} = 0 to 1024 { -// CHECK-NEXT: affine.for %{{.*}} = {{#map[0-9]+}}(%{{.*}}) to {{#map[0-9]+}}(%{{.*}}) { -// CHECK-NEXT: constant 0.000000e+00 : f32 -// CHECK-NEXT: affine.store %{{.*}}, [[BUF]][%{{.*}}] : memref<1027xf32, 2> +// CHECK-NEXT: affine.for %[[I2:.*]] = {{#map[0-9]+}}(%{{.*}}) to {{#map[0-9]+}}(%{{.*}}) { +// CHECK: affine.store %{{.*}}, [[BUF]][%[[I2]]] : memref<1027xf32, 2> // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: 
affine.dma_start [[BUF]][%{{.*}}], %{{.*}}[%{{.*}}], [[MEM]][%{{.*}}], %{{.*}} : memref<1027xf32, 2>, memref<1027xf32>, memref<1xi32> @@ -478,9 +457,6 @@ func @relative_loop_bounds(%arg0: memref<1027xf32>) { // ----- -// CHECK-DAG: [[MAP_READ_OFFSET:#map[0-9]+]] = affine_map<(d0) -> (d0 + 100)> -// CHECK-DAG: [[MAP_WRITE_OFFSET:#map[0-9]+]] = affine_map<(d0) -> (d0 + 25)> - func @test_read_write_region_union() { %0 = alloc() : memref<256xf32> affine.for %i0 = 0 to 10 { @@ -503,9 +479,7 @@ func @test_read_write_region_union() { // CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32> // CHECK-NEXT: alloc() : memref<1xi32> // CHECK-NEXT: affine.for %{{.*}} = 0 to 10 { -// CHECK-NEXT: affine.apply [[MAP_READ_OFFSET]](%{{.*}}) -// CHECK-NEXT: affine.apply [[MAP_WRITE_OFFSET]](%{{.*}}) -// CHECK-NEXT: affine.load %{{.*}}[%{{.*}} + 75] : memref<85xf32, 2> +// CHECK: affine.load %{{.*}}[%{{.*}} + 75] : memref<85xf32, 2> // CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<85xf32, 2> // CHECK-NEXT: } // CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<85xf32, 2>, memref<256xf32>, memref<1xi32> diff --git a/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp b/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp index 35c374c..9e3b343 100644 --- a/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp +++ b/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp @@ -13,6 +13,7 @@ #include "mlir/Analysis/Utils.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/IR/PatternMatch.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/LoopUtils.h" #include "mlir/Transforms/Passes.h" @@ -76,8 +77,8 @@ void TestAffineDataCopy::runOnFunction() { /*fastMemorySpace=*/0, /*tagMemorySpace=*/0, /*fastMemCapacityBytes=*/32 * 1024 * 1024UL}; + DenseSet copyNests; if (clMemRefFilter) { - DenseSet copyNests; affineDataCopyGenerate(loopNest, copyOptions, load.getMemRef(), copyNests); } else if (clTestGenerateCopyForMemRegion) { CopyGenerateResult result; @@ -85,6 +86,17 @@ void TestAffineDataCopy::runOnFunction() { region.compute(load, /*loopDepth=*/0); generateCopyForMemRegion(region, loopNest, copyOptions, result); } + + // Promote any single iteration loops in the copy nests. + for (auto nest : copyNests) + nest->walk([](AffineForOp forOp) { promoteIfSingleIteration(forOp); }); + + // Promoting single iteration loops could lead to simplification + // of load's/store's. We will run the canonicalization patterns again. + OwningRewritePatternList patterns; + AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext()); + AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext()); + applyPatternsGreedily(getFunction(), std::move(patterns)); } namespace mlir { -- 2.7.4
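
A minimal sketch (not part of the patch, mirroring the @min_upper_bound test
above; the function and value names are illustrative) of the copy-in structure
the updated generatePointWiseCopy produces for a tiled loop with a `min` upper
bound. The copy loop now runs over the absolute range [%i, min(4096, %i + 100))
rather than [0, tile size), and the fast-buffer subscript subtracts the
region's lower bound:

#id = affine_map<(d0) -> (d0)>
#ub = affine_map<(d0) -> (4096, d0 + 100)>

func @copy_in_sketch(%A: memref<4096xf32>) {
  affine.for %i = 0 to 4096 step 100 {
    // Fast buffer sized to the largest tile extent.
    %buf = alloc() : memref<100xf32>
    // Copy-in nest: original (absolute) bounds, offset only in the subscript.
    affine.for %ii = #id(%i) to min #ub(%i) {
      %v = affine.load %A[%ii] : memref<4096xf32>
      affine.store %v, %buf[%ii - %i] : memref<100xf32>
    }
    // ... compute on %buf, then copy out symmetrically ...
    dealloc %buf : memref<100xf32>
  }
  return
}

Keeping the copy loops on the original iteration space lets the min/max bound
maps be reused verbatim; only the fast-buffer subscript needs the lower-bound
subtraction, and the subsequent single-iteration promotion plus load/store
canonicalization clean up the degenerate cases.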