Fix point-wise copy generation to work with bounds that have max/min.
Change the structure of the copy loop nest to use absolute loop indices,
subtracting the base from the indices into the fast buffers. Update supporting
utilities: fix FlatAffineConstraints::getLowerAndUpperBound to also consider
equalities and to add a missing division. Update unionBoundingBox
to not discard common constraints (leads to a tighter system). Update
MemRefRegion::getConstantBoundingSizeAndShape to add memref dimension
constraints. Run removeTrivialRedundancy at the end of
MemRefRegion::compute. Run single-iteration loop promotion and
load/store canonicalization after affine data copy generation (in its test
pass as well).
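
For illustration, the copy-in nest for a 1-d region now uses absolute loop
indices and subtracts the region's base when indexing the fast buffer (a rough
sketch based on the test cases below; names and maps are illustrative):

  affine.for %ii = #lb(%i) to min #ub(%i) {
    %v = affine.load %A[%ii] : memref<4096xf32>
    affine.store %v, %buf[%ii - %i] : memref<100xf32>
  }

Previously, the copy loops iterated from 0 to the fast buffer size and added
the base offset when indexing the original memref.
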
Differential Revision: https://reviews.llvm.org/D77320
/// one; None otherwise.
Optional<int64_t> getConstantUpperBound(unsigned pos) const;
- /// Gets the lower and upper bound of the pos^th identifier treating
- /// [0, offset) U [offset + num, symStartPos) as dimensions and
- /// [symStartPos, getNumDimAndSymbolIds) as symbols. The returned
- /// multi-dimensional maps in the pair represent the max and min of
- /// potentially multiple affine expressions. The upper bound is exclusive.
- /// 'localExprs' holds pre-computed AffineExpr's for all local identifiers in
- /// the system.
+ /// Gets the lower and upper bound of the `offset` + `pos`th identifier
+ /// treating [0, offset) U [offset + num, symStartPos) as dimensions and
+ /// [symStartPos, getNumDimAndSymbolIds) as symbols; `pos` must lie in
+ /// [0, num). The multi-dimensional maps in the returned pair represent the
+ /// max and min of potentially multiple affine expressions. The upper bound is
+ /// exclusive. `localExprs` holds pre-computed AffineExpr's for all local
+ /// identifiers in the system.
std::pair<AffineMap, AffineMap>
getLowerAndUpperBound(unsigned pos, unsigned offset, unsigned num,
unsigned symStartPos, ArrayRef<AffineExpr> localExprs,
/// i.e., the returned bounding constant holds for *any given* value of the
/// symbol identifiers. The 'shape' vector is set to the corresponding
/// dimension-wise bounds major to minor. We use int64_t instead of uint64_t
- /// since index types can be at most int64_t.
+ /// since index types can be at most int64_t. `lbs` are set to the lower
+ /// bounds for each of the rank dimensions, and `lbDivisors` contains the
+ /// corresponding denominators for the floordivs.
Optional<int64_t> getConstantBoundingSizeAndShape(
SmallVectorImpl<int64_t> *shape = nullptr,
std::vector<SmallVector<int64_t, 4>> *lbs = nullptr,
SmallVectorImpl<int64_t> *lbDivisors = nullptr) const;
+ /// Gets the lower and upper bound map for the dimensional identifier at
+ /// `pos`.
+ void getLowerAndUpperBound(unsigned pos, AffineMap &lbMap,
+ AffineMap &ubMap) const;
+
/// A wrapper around FlatAffineConstraints::getConstantBoundOnDimSize(). 'pos'
/// corresponds to the position of the memref shape's dimension (major to
/// minor) which matches 1:1 with the dimensional identifier positions in
assert(getNumLocalIds() == localExprs.size() &&
"incorrect local exprs count");
- SmallVector<unsigned, 4> lbIndices, ubIndices;
- getLowerAndUpperBoundIndices(pos + offset, &lbIndices, &ubIndices);
+ SmallVector<unsigned, 4> lbIndices, ubIndices, eqIndices;
+ getLowerAndUpperBoundIndices(pos + offset, &lbIndices, &ubIndices, &eqIndices,
+ offset, num);
/// Add to 'b' from 'a' in set [0, offset) U [offset + num, symbStartPos).
auto addCoeffs = [&](ArrayRef<int64_t> a, SmallVectorImpl<int64_t> &b) {
};
SmallVector<int64_t, 8> lb, ub;
- SmallVector<AffineExpr, 4> exprs;
+ SmallVector<AffineExpr, 4> lbExprs;
unsigned dimCount = symStartPos - num;
unsigned symCount = getNumDimAndSymbolIds() - symStartPos;
- exprs.reserve(lbIndices.size());
+ lbExprs.reserve(lbIndices.size() + eqIndices.size());
// Lower bound expressions.
for (auto idx : lbIndices) {
auto ineq = getInequality(idx);
std::transform(lb.begin(), lb.end(), lb.begin(), std::negate<int64_t>());
auto expr =
getAffineExprFromFlatForm(lb, dimCount, symCount, localExprs, context);
- exprs.push_back(expr);
+ // expr ceildiv divisor is (expr + divisor - 1) floordiv divisor
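+ // E.g., with divisor = 4: 7 ceildiv 4 = 2 and (7 + 4 - 1) floordiv 4 = 2.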
+ int64_t divisor = std::abs(ineq[pos + offset]);
+ expr = (expr + divisor - 1).floorDiv(divisor);
+ lbExprs.push_back(expr);
}
- auto lbMap =
- exprs.empty() ? AffineMap() : AffineMap::get(dimCount, symCount, exprs);
- exprs.clear();
- exprs.reserve(ubIndices.size());
+ SmallVector<AffineExpr, 4> ubExprs;
+ ubExprs.reserve(ubIndices.size() + eqIndices.size());
// Upper bound expressions.
for (auto idx : ubIndices) {
auto ineq = getInequality(idx);
addCoeffs(ineq, ub);
auto expr =
getAffineExprFromFlatForm(ub, dimCount, symCount, localExprs, context);
+ expr = expr.floorDiv(std::abs(ineq[pos + offset]));
+ // Upper bound is exclusive.
+ ubExprs.push_back(expr + 1);
+ }
+
+ // Equalities. Each equality is both a lower and an upper bound.
+ SmallVector<int64_t, 4> b;
+ for (auto idx : eqIndices) {
+ auto eq = getEquality(idx);
+ addCoeffs(eq, b);
+ if (eq[pos + offset] > 0)
+ std::transform(b.begin(), b.end(), b.begin(), std::negate<int64_t>());
+
+ // Extract the upper bound (in terms of other coeff's + const).
+ auto expr =
+ getAffineExprFromFlatForm(b, dimCount, symCount, localExprs, context);
+ expr = expr.floorDiv(std::abs(eq[pos + offset]));
// Upper bound is exclusive.
- exprs.push_back(expr + 1);
+ ubExprs.push_back(expr + 1);
+ // Lower bound.
+ expr =
+ getAffineExprFromFlatForm(b, dimCount, symCount, localExprs, context);
+ expr = expr.ceilDiv(std::abs(eq[pos + offset]));
+ lbExprs.push_back(expr);
}
- auto ubMap =
- exprs.empty() ? AffineMap() : AffineMap::get(dimCount, symCount, exprs);
+
+ auto lbMap = lbExprs.empty() ? AffineMap()
+ : AffineMap::get(dimCount, symCount, lbExprs);
+
+ auto ubMap = ubExprs.empty() ? AffineMap()
+ : AffineMap::get(dimCount, symCount, ubExprs);
return {lbMap, ubMap};
}
tmpClone->removeRedundantInequalities();
}
std::tie(lbMap, ubMap) = tmpClone->getLowerAndUpperBound(
- pos, offset, num, getNumDimIds(), {}, context);
+ pos, offset, num, getNumDimIds(), /*localExprs=*/{}, context);
}
// If the above fails, we'll just use the constant lower bound and the
}
} // namespace
+// Returns constraints that are common to both A & B.
+static void getCommonConstraints(const FlatAffineConstraints &A,
+ const FlatAffineConstraints &B,
+ FlatAffineConstraints &C) {
+ C.reset(A.getNumDimIds(), A.getNumSymbolIds(), A.getNumLocalIds());
+ // A naive O(n^2) check should be enough here given the input sizes.
+ for (unsigned r = 0, e = A.getNumInequalities(); r < e; ++r) {
+ for (unsigned s = 0, f = B.getNumInequalities(); s < f; ++s) {
+ if (A.getInequality(r) == B.getInequality(s)) {
+ C.addInequality(A.getInequality(r));
+ break;
+ }
+ }
+ }
+ for (unsigned r = 0, e = A.getNumEqualities(); r < e; ++r) {
+ for (unsigned s = 0, f = B.getNumEqualities(); s < f; ++s) {
+ if (A.getEquality(r) == B.getEquality(s)) {
+ C.addEquality(A.getEquality(r));
+ break;
+ }
+ }
+ }
+}
+
// Computes the bounding box with respect to 'other' by finding the min of the
// lower bounds and the max of the upper bounds along each of the dimensions.
LogicalResult
assert(otherCst.getNumLocalIds() == 0 && "local ids not supported here");
assert(getNumLocalIds() == 0 && "local ids not supported yet here");
+ // Align `other` to this.
Optional<FlatAffineConstraints> otherCopy;
if (!areIdsAligned(*this, otherCst)) {
otherCopy.emplace(FlatAffineConstraints(otherCst));
mergeAndAlignIds(/*offset=*/numDims, this, &otherCopy.getValue());
}
- const auto &other = otherCopy ? *otherCopy : otherCst;
+ const auto &otherAligned = otherCopy ? *otherCopy : otherCst;
+
+ // Get the constraints common to both systems; these will be added as is to
+ // the union.
+ FlatAffineConstraints commonCst;
+ getCommonConstraints(*this, otherAligned, commonCst);
std::vector<SmallVector<int64_t, 8>> boundingLbs;
std::vector<SmallVector<int64_t, 8>> boundingUbs;
// TODO(bondhugula): handle union if a dimension is unbounded.
return failure();
- auto otherExtent = other.getConstantBoundOnDimSize(
+ auto otherExtent = otherAligned.getConstantBoundOnDimSize(
d, &otherLb, &otherLbFloorDivisor, &otherUb);
if (!otherExtent.hasValue() || lbFloorDivisor != otherLbFloorDivisor)
// TODO(bondhugula): symbolic extents when necessary.
} else {
// Uncomparable - check for constant lower/upper bounds.
auto constLb = getConstantLowerBound(d);
- auto constOtherLb = other.getConstantLowerBound(d);
+ auto constOtherLb = otherAligned.getConstantLowerBound(d);
if (!constLb.hasValue() || !constOtherLb.hasValue())
return failure();
std::fill(minLb.begin(), minLb.end(), 0);
} else {
// Uncomparable - check for constant lower/upper bounds.
auto constUb = getConstantUpperBound(d);
- auto constOtherUb = other.getConstantUpperBound(d);
+ auto constOtherUb = otherAligned.getConstantUpperBound(d);
if (!constUb.hasValue() || !constOtherUb.hasValue())
return failure();
std::fill(maxUb.begin(), maxUb.end(), 0);
addInequality(boundingLbs[d]);
addInequality(boundingUbs[d]);
}
+
+ // Add the constraints that were common to both systems.
+ append(commonCst);
+ removeTrivialRedundancy();
+
// TODO(mlir-team): copy over pure symbolic constraints from this and 'other'
// over to the union (since the above are just the union along dimensions); we
// shouldn't be discarding any other constraints on the symbols.
assert(cst->containsId(value) && "value expected to be present");
if (isValidSymbol(value)) {
// Check if the symbol is a constant.
-
if (auto cOp = dyn_cast_or_null<ConstantIndexOp>(value.getDefiningOp()))
cst->setIdToConstant(value, cOp.getValue());
} else if (auto loop = getForInductionVarOwner(value)) {
assert(rank == cst.getNumDimIds() && "inconsistent memref region");
+ // Use a copy of the region constraints that has upper/lower bounds for each
+ // memref dimension with static size added to guard against potential
+ // over-approximation from projection or union bounding box. We do not add
+ // these to the region itself since they might just be redundant constraints
+ // that would need non-trivial means to eliminate.
+ FlatAffineConstraints cstWithShapeBounds(cst);
+ for (unsigned r = 0; r < rank; r++) {
+ cstWithShapeBounds.addConstantLowerBound(r, 0);
+ int64_t dimSize = memRefType.getDimSize(r);
+ if (ShapedType::isDynamic(dimSize))
+ continue;
+ cstWithShapeBounds.addConstantUpperBound(r, dimSize - 1);
+ }
+
// Find a constant upper bound on the extent of this memref region along each
// dimension.
int64_t numElements = 1;
int64_t lbDivisor;
for (unsigned d = 0; d < rank; d++) {
SmallVector<int64_t, 4> lb;
- Optional<int64_t> diff = cst.getConstantBoundOnDimSize(d, &lb, &lbDivisor);
+ Optional<int64_t> diff =
+ cstWithShapeBounds.getConstantBoundOnDimSize(d, &lb, &lbDivisor);
if (diff.hasValue()) {
diffConstant = diff.getValue();
assert(lbDivisor > 0);
return None;
diffConstant = dimSize;
// Lower bound becomes 0.
- lb.resize(cst.getNumSymbolIds() + 1, 0);
+ lb.resize(cstWithShapeBounds.getNumSymbolIds() + 1, 0);
lbDivisor = 1;
}
numElements *= diffConstant;
return numElements;
}
+void MemRefRegion::getLowerAndUpperBound(unsigned pos, AffineMap &lbMap,
+ AffineMap &ubMap) const {
+ assert(pos < cst.getNumDimIds() && "invalid position");
+ auto memRefType = memref.getType().cast<MemRefType>();
+ unsigned rank = memRefType.getRank();
+
+ assert(rank == cst.getNumDimIds() && "inconsistent memref region");
+
+ auto boundPairs = cst.getLowerAndUpperBound(
+ pos, /*offset=*/0, /*num=*/rank, cst.getNumDimAndSymbolIds(),
+ /*localExprs=*/{}, memRefType.getContext());
+ lbMap = boundPairs.first;
+ ubMap = boundPairs.second;
+ assert(lbMap && "lower bound for a region must exist");
+ assert(ubMap && "upper bound for a region must exist");
+ assert(lbMap.getNumInputs() == cst.getNumDimAndSymbolIds() - rank);
+ assert(ubMap.getNumInputs() == cst.getNumDimAndSymbolIds() - rank);
+}
+
LogicalResult MemRefRegion::unionBoundingBox(const MemRefRegion &other) {
assert(memref == other.memref);
return cst.unionBoundingBox(*other.getConstraints());
cst.addConstantUpperBound(r, dimSize - 1);
}
}
+ cst.removeTrivialRedundancy();
LLVM_DEBUG(llvm::dbgs() << "Memory region:\n");
LLVM_DEBUG(cst.dump());
#include "mlir/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
-#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/Dialect/Affine/Passes.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
+#include "mlir/IR/PatternMatch.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
#include "llvm/ADT/MapVector.h"
runOnBlock(&block, copyNests);
// Promote any single iteration loops in the copy nests.
- for (auto nest : copyNests) {
+ for (auto nest : copyNests)
nest->walk([](AffineForOp forOp) { promoteIfSingleIteration(forOp); });
+
+ // Promoting single iteration loops could lead to simplification of
+ // loads/stores. We will run canonicalization patterns on load/stores.
+ // TODO: this whole-function load/store canonicalization should be replaced by
+ // canonicalization that is limited to only the load/store ops
+ // introduced/touched by this pass (those inside 'copyNests'). This would be
+ // possible once the necessary support is available in the pattern rewriter.
+ if (!copyNests.empty()) {
+ OwningRewritePatternList patterns;
+ AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext());
+ AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext());
+ applyPatternsGreedily(f, std::move(patterns));
}
}
}
/// Generates a point-wise copy from/to `memref' to/from `fastMemRef' and
-/// returns the outermost AffineForOp of the copy loop nest. `memIndicesStart'
-/// holds the lower coordinates of the region in the original memref to copy
-/// in/out. If `copyOut' is true, generates a copy-out; otherwise a copy-in.
-static AffineForOp generatePointWiseCopy(Location loc, Value memref,
- Value fastMemRef,
- AffineMap memAffineMap,
- ArrayRef<Value> memIndicesStart,
- ArrayRef<int64_t> fastBufferShape,
- bool isCopyOut, OpBuilder b) {
- assert(!memIndicesStart.empty() && "only 1-d or more memrefs");
-
- // The copy-in nest is generated as follows as an example for a 2-d region:
- // for x = ...
- // for y = ...
- // fast_buf[x][y] = buf[mem_x + x][mem_y + y]
-
- SmallVector<Value, 4> fastBufIndices, memIndices;
+/// returns the outermost AffineForOp of the copy loop nest. `lbMaps` and
+/// `ubMaps` along with `lbOperands` and `ubOperands` hold the lower and upper
+/// bound information for the copy loop nest. `fastBufOffsets` contains the
+/// expressions to be subtracted out from the respective copy loop iterators in
+/// order to index the fast buffer. If `isCopyOut` is true, generates a
+/// copy-out; otherwise a copy-in. The builder `b` should be set to the point
+/// where the copy nest is to be inserted.
+///
+/// The copy-in nest is generated as follows as an example for a 2-d region:
+/// for x = ...
+/// for y = ...
+/// fast_buf[x - offset_x][y - offset_y] = memref[x][y]
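+///
+/// The copy-out nest is analogous, with the load from fast_buf and the store
+/// to memref.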
+///
+static AffineForOp
+generatePointWiseCopy(Location loc, Value memref, Value fastMemRef,
+ ArrayRef<AffineMap> lbMaps, ArrayRef<Value> lbOperands,
+ ArrayRef<AffineMap> ubMaps, ArrayRef<Value> ubOperands,
+ ArrayRef<AffineExpr> fastBufOffsets, bool isCopyOut,
+ OpBuilder b) {
+ assert(llvm::all_of(lbMaps, [&](AffineMap lbMap) {
+ return lbMap.getNumInputs() == lbOperands.size();
+ }));
+ assert(llvm::all_of(ubMaps, [&](AffineMap ubMap) {
+ return ubMap.getNumInputs() == ubOperands.size();
+ }));
+
+ unsigned rank = memref.getType().cast<MemRefType>().getRank();
+ assert(lbMaps.size() == rank && "wrong number of lb maps");
+ assert(ubMaps.size() == rank && "wrong number of ub maps");
+
+ SmallVector<Value, 4> memIndices;
+ SmallVector<AffineExpr, 4> fastBufExprs;
+ SmallVector<Value, 4> fastBufMapOperands;
AffineForOp copyNestRoot;
- for (unsigned d = 0, e = fastBufferShape.size(); d < e; ++d) {
- auto forOp = b.create<AffineForOp>(loc, 0, fastBufferShape[d]);
+ for (unsigned d = 0; d < rank; ++d) {
+ auto forOp = createCanonicalizedAffineForOp(b, loc, lbOperands, lbMaps[d],
+ ubOperands, ubMaps[d]);
if (d == 0)
copyNestRoot = forOp;
+
b = forOp.getBodyBuilder();
- fastBufIndices.push_back(forOp.getInductionVar());
-
- Value memBase =
- (memAffineMap == b.getMultiDimIdentityMap(memAffineMap.getNumDims()))
- ? memIndicesStart[d]
- : b.create<AffineApplyOp>(
- loc,
- AffineMap::get(memAffineMap.getNumDims(),
- memAffineMap.getNumSymbols(),
- memAffineMap.getResult(d)),
- memIndicesStart);
-
- // Construct the subscript for the slow memref being copied.
- auto memIndex = b.create<AffineApplyOp>(
- loc,
- AffineMap::get(2, 0, b.getAffineDimExpr(0) + b.getAffineDimExpr(1)),
- ValueRange({memBase, forOp.getInductionVar()}));
- memIndices.push_back(memIndex);
+
+ auto fastBufOffsetMap =
+ AffineMap::get(lbOperands.size(), 0, {fastBufOffsets[d]});
+ auto offset = b.create<AffineApplyOp>(loc, fastBufOffsetMap, lbOperands);
+
+ // Construct the subscript for the fast memref being copied into/from:
+ // x - offset_x.
+ fastBufExprs.push_back(b.getAffineDimExpr(2 * d + 1) -
+ b.getAffineDimExpr(2 * d));
+ fastBufMapOperands.push_back(offset);
+ fastBufMapOperands.push_back(forOp.getInductionVar());
+
+ // Subscript for the slow memref being copied.
+ memIndices.push_back(forOp.getInductionVar());
}
+ auto fastBufMap = AffineMap::get(2 * rank, /*symbolCount=*/0, fastBufExprs);
+ fullyComposeAffineMapAndOperands(&fastBufMap, &fastBufMapOperands);
+ fastBufMap = simplifyAffineMap(fastBufMap);
+ canonicalizeMapAndOperands(&fastBufMap, &fastBufMapOperands);
+
if (!isCopyOut) {
// Copy in.
auto load = b.create<AffineLoadOp>(loc, memref, memIndices);
- b.create<AffineStoreOp>(loc, load, fastMemRef, fastBufIndices);
+ b.create<AffineStoreOp>(loc, load, fastMemRef, fastBufMap,
+ fastBufMapOperands);
return copyNestRoot;
}
// Copy out.
- auto load = b.create<AffineLoadOp>(loc, fastMemRef, fastBufIndices);
+ auto load =
+ b.create<AffineLoadOp>(loc, fastMemRef, fastBufMap, fastBufMapOperands);
b.create<AffineStoreOp>(loc, load, memref, memIndices);
return copyNestRoot;
}
return success();
}
+ SmallVector<AffineMap, 4> lbMaps(rank), ubMaps(rank);
+ for (unsigned i = 0; i < rank; ++i)
+ region.getLowerAndUpperBound(i, lbMaps[i], ubMaps[i]);
+
const FlatAffineConstraints *cst = region.getConstraints();
// 'regionSymbols' hold values that this memory region is symbolic/parametric
// on; these typically include loop IVs surrounding the level at which the
// along the corresponding dimension.
// Index start offsets for faster memory buffer relative to the original.
- SmallVector<AffineExpr, 4> offsets;
- offsets.reserve(rank);
+ SmallVector<AffineExpr, 4> fastBufOffsets;
+ fastBufOffsets.reserve(rank);
for (unsigned d = 0; d < rank; d++) {
assert(lbs[d].size() == cst->getNumCols() - rank && "incorrect bound size");
AffineExpr offset = top.getAffineConstantExpr(0);
- for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++) {
+ for (unsigned j = 0, e = cst->getNumCols() - rank - 1; j < e; j++)
offset = offset + lbs[d][j] * top.getAffineDimExpr(j);
- }
assert(lbDivisors[d] > 0);
offset =
(offset + lbs[d][cst->getNumCols() - 1 - rank]).floorDiv(lbDivisors[d]);
// Record the offsets since they are needed to remap the memory accesses of
// the original memref further below.
- offsets.push_back(offset);
+ fastBufOffsets.push_back(offset);
}
// The faster memory space buffer.
if (!copyOptions.generateDma) {
// Point-wise copy generation.
- auto copyNest = generatePointWiseCopy(loc, memref, fastMemRef, memAffineMap,
- memIndices, fastBufferShape,
- /*isCopyOut=*/region.isWrite(), b);
+ auto copyNest =
+ generatePointWiseCopy(loc, memref, fastMemRef, lbMaps,
+ /*lbOperands=*/regionSymbols, ubMaps,
+ /*ubOperands=*/regionSymbols, fastBufOffsets,
+ /*isCopyOut=*/region.isWrite(), b);
// Record this so that we can skip it from yet another copy.
copyNests.insert(copyNest);
// which the memref region is parametric); then those corresponding to
// the memref's original indices follow.
auto dimExpr = b.getAffineDimExpr(regionSymbols.size() + i);
- remapExprs.push_back(dimExpr - offsets[i]);
+ remapExprs.push_back(dimExpr - fastBufOffsets[i]);
}
auto indexRemap = AffineMap::get(regionSymbols.size() + rank, 0, remapExprs);
// Compute the MemRefRegion accessed.
auto region = std::make_unique<MemRefRegion>(opInst->getLoc());
- if (failed(region->compute(opInst, copyDepth))) {
+ if (failed(region->compute(opInst, copyDepth, /*sliceState=*/nullptr,
+ /*addMemRefDimBounds=*/false))) {
LLVM_DEBUG(llvm::dbgs()
<< "Error obtaining memory region: semi-affine maps?\n");
LLVM_DEBUG(llvm::dbgs() << "over-approximating to the entire memref\n");
if (totalCopyBuffersSizeInBytes > copyOptions.fastMemCapacityBytes) {
StringRef str = "Total size of all copy buffers' for this block "
"exceeds fast memory capacity\n";
- block->getParentOp()->emitError(str);
+ block->getParentOp()->emitWarning(str);
}
return totalCopyBuffersSizeInBytes;
#id = affine_map<(d0) -> (d0)>
#ub = affine_map<(d0) -> (d0 + 128)>
-// Map used to index the original memref while copying.
-// CHECK-DAG: [[MEM_IDX_MAP:map[0-9]+]] = affine_map<(d0, d1) -> (d0 + d1)>
// Map used to index the buffer while computing.
// CHECK-DAG: [[MAP_IDENTITY:map[0-9]+]] = affine_map<(d0) -> (d0)>
// CHECK-DAG: [[MAP_PLUS_128:map[0-9]+]] = affine_map<(d0) -> (d0 + 128)>
-// CHECK-DAG: [[BUF_IDX_MAP:map[0-9]+]] = affine_map<(d0, d1, d2, d3) -> (-d0 + d2, -d1 + d3)>
// CHECK-LABEL: func @matmul
// FILTER-LABEL: func @matmul
// Buffers of size 128x128 get created here for all three matrices.
-// CHECK: affine.for %{{.*}} = 0 to 4096 step 128 {
-// CHECK: affine.for %{{.*}} = 0 to 4096 step 128 {
+// CHECK: affine.for %[[I:.*]] = 0 to 4096 step 128 {
+// CHECK: affine.for %[[J:.*]] = 0 to 4096 step 128 {
// CHECK: [[BUFC:%[0-9]+]] = alloc() : memref<128x128xf32>
-
// The result matrix's copy gets hoisted out.
// Result matrix copy-in.
-// CHECK: affine.for %{{.*}} = 0 to 128 {
-// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}})
-// CHECK: affine.for %{{.*}} = 0 to 128 {
-// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}})
+// CHECK: affine.for %[[II:.*]] = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) {
+// CHECK: affine.for %[[JJ:.*]] = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) {
// CHECK: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32>
-// CHECK: affine.store %{{.*}}, [[BUFC]][%{{.*}}, %{{.*}}] : memref<128x128xf32>
+// CHECK: affine.store %{{.*}}, [[BUFC]][-%[[I]] + %[[II]], -%[[J]] + %[[JJ]]] : memref<128x128xf32>
// CHECK: }
// CHECK: }
// LHS matrix copy-in.
-// CHECK: affine.for %{{.*}} = 0 to 4096 step 128 {
+// CHECK: affine.for %[[K:.*]] = 0 to 4096 step 128 {
// CHECK: [[BUFA:%[0-9]+]] = alloc() : memref<128x128xf32>
-// CHECK: affine.for %{{.*}} = 0 to 128 {
-// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}})
-// CHECK: affine.for %{{.*}} = 0 to 128 {
-// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}})
+// CHECK: affine.for %[[II:.*]] = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) {
+// CHECK: affine.for %[[KK:.*]] = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) {
// CHECK: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32>
-// CHECK: affine.store %{{.*}}, [[BUFA]][%{{.*}}, %{{.*}}] : memref<128x128xf32>
+// CHECK: affine.store %{{.*}}, [[BUFA]][-%[[I]] + %[[II]], -%[[K]] + %[[KK]]] : memref<128x128xf32>
// CHECK: }
// CHECK: }
// RHS matrix copy-in.
// CHECK: [[BUFB:%[0-9]+]] = alloc() : memref<128x128xf32>
-// CHECK: affine.for %{{.*}} = 0 to 128 {
-// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}})
-// CHECK: affine.for %{{.*}} = 0 to 128 {
-// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}})
+// CHECK: affine.for %[[KK:.*]] = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) {
+// CHECK: affine.for %[[JJ:.*]] = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) {
// CHECK: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32>
-// CHECK: affine.store %{{.*}}, [[BUFB]][%{{.*}}, %{{.*}}] : memref<128x128xf32>
+// CHECK: affine.store %{{.*}}, [[BUFB]][-%[[K]] + %[[KK]], -%[[J]] + %[[JJ]]] : memref<128x128xf32>
// CHECK: }
// CHECK: }
// CHECK: dealloc [[BUFB]] : memref<128x128xf32>
// CHECK: dealloc [[BUFA]] : memref<128x128xf32>
// CHECK: }
-// CHECK: affine.apply #map{{.*}}(%{{.*}}, %{{.*}})
-// CHECK: affine.apply #map{{.*}}(%{{.*}}, %{{.*}})
// Result matrix copy out.
-// CHECK: affine.for %{{.*}} = 0 to 128 {
-// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}})
-// CHECK: affine.for %{{.*}} = 0 to 128 {
-// CHECK: affine.apply #[[MEM_IDX_MAP]](%{{.*}}, %{{.*}})
-// CHECK: [[BUFA]] = affine.load [[BUFC]][%{{.*}}, %{{.*}}] : memref<128x128xf32>
-// CHECK: store [[BUFA]], %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32>
+// CHECK: affine.for %{{.*}} = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) {
+// CHECK: affine.for %{{.*}} = #[[MAP_IDENTITY]](%{{.*}}) to #[[MAP_PLUS_128]](%{{.*}}) {
+// CHECK: affine.load [[BUFC]][-%{{.*}} + %{{.*}}, -%{{.*}} + %{{.*}}] : memref<128x128xf32>
+// CHECK: store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<4096x4096xf32>
// CHECK: }
// CHECK: }
// CHECK: dealloc [[BUFC]] : memref<128x128xf32>
// FILTER: affine.for %{{.*}} = 0 to 4096 step 128 {
// FILTER: alloc() : memref<128x4096xf32>
// FILTER-NOT: alloc()
-// FILTER: affine.for %{{.*}} = 0 to 128 {
+// FILTER: affine.for
// FILTER: affine.for %{{.*}} = 0 to 4096 {
// FILTER: affine.for %{{.*}} = 0 to 4096 step 128 {
// FILTER-NEXT: affine.for %{{.*}} = 0 to 4096 step 128 {
// FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) {
// FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) {
// FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) {
-// FILTER: dealloc %1 : memref<128x4096xf32>
-// FILTER-NOT: dealloc %1 : memref<128x4096xf32>
+// FILTER: dealloc %{{.*}} : memref<128x4096xf32>
+// FILTER-NOT: dealloc %{{.*}} : memref<128x4096xf32>
// -----
// This test case will lead to single element buffers. These are eventually
// expected to be turned into registers via alloca and mem2reg.
//
-// CHECK-SMALL-LABEL: func @foo
-// FILTER-LABEL: func @foo
-// MEMREF_REGION-LABEL: func @foo
-func @foo(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>) -> memref<1024x1024xf32> {
+// CHECK-SMALL-LABEL: func @single_elt_buffers
+// FILTER-LABEL: func @single_elt_buffers
+// MEMREF_REGION-LABEL: func @single_elt_buffers
+func @single_elt_buffers(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>) -> memref<1024x1024xf32> {
affine.for %i = 0 to 1024 {
affine.for %j = 0 to 1024 {
affine.for %k = 0 to 1024 {
}
// CHECK-SMALL: affine.for %arg{{.*}} = 0 to 1024 {
// CHECK-SMALL: affine.for %arg{{.*}} = 0 to 1024 {
-// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %arg{{.*}})
-// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %arg{{.*}})
// CHECK-SMALL: alloc() : memref<1x1xf32>
-// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %c0{{.*}})
-// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %c0{{.*}})
// CHECK-SMALL: affine.load %arg{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
-// CHECK-SMALL: affine.store %{{.*}}, %{{.*}}[%c0{{.*}}, %c0{{.*}}] : memref<1x1xf32>
+// CHECK-SMALL: affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
// CHECK-SMALL: affine.for %arg{{.*}} = 0 to 1024 {
-// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %arg{{.*}})
-// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %arg{{.*}})
// CHECK-SMALL: alloc() : memref<1x1xf32>
-// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %c0{{.*}})
-// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %c0{{.*}})
// CHECK-SMALL: affine.load %arg{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
-// CHECK-SMALL: affine.store %{{.*}}, %{{.*}}[%c0{{.*}}, %c0{{.*}}] : memref<1x1xf32>
+// CHECK-SMALL: affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
// CHECK-SMALL: affine.load %{{.*}}[0, 0] : memref<1x1xf32>
// CHECK-SMALL: affine.load %{{.*}}[0, 0] : memref<1x1xf32>
// CHECK-SMALL: addf %{{.*}}, %{{.*}} : f32
// CHECK-SMALL: affine.store %{{.*}}, %{{.*}}[0, 0] : memref<1x1xf32>
// CHECK-SMALL: dealloc %{{.*}} : memref<1x1xf32>
// CHECK-SMALL: }
-// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %arg{{.*}})
-// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %arg{{.*}})
-// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %c0{{.*}})
-// CHECK-SMALL: affine.apply #map{{.*}}(%arg{{.*}}, %c0{{.*}})
-// CHECK-SMALL: affine.load %{{.*}}[%c0{{.*}}, %c0{{.*}}] : memref<1x1xf32>
+// CHECK-SMALL: affine.load %{{.*}}[0, 0] : memref<1x1xf32>
// CHECK-SMALL: affine.store %{{.*}}, %arg{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x1024xf32>
// CHECK-SMALL: dealloc %{{.*}} : memref<1x1xf32>
// CHECK-SMALL: }
// MEMREF_REGION-NOT: alloc()
// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
-// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
+// MEMREF_REGION: }
+// MEMREF_REGION: }
+// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {
// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {
// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {
// MEMREF_REGION: dealloc %{{.*}} : memref<1024x1024xf32>
// MEMREF_REGION-NOT: dealloc
+// MEMREF_REGION-NEXT: return
+
+// -----
+
+// This pattern typically appears with tiling when the tile sizes used don't
+// divide the loop trip counts.
+
+#map_ub = affine_map<(d0) -> (4096, d0 + 100)>
+
+// CHECK-DAG: [[MAP_IDENTITY:map[0-9]+]] = affine_map<(d0) -> (d0)>
+// CHECK-DAG: [[MAP_MIN_UB1:map[0-9]+]] = affine_map<(d0) -> (d0 + 100, 4096)>
+// CHECK-DAG: [[MAP_MIN_UB2:map[0-9]+]] = affine_map<(d0) -> (4096, d0 + 100)>
+
+// CHECK-LABEL: func @min_upper_bound
+func @min_upper_bound(%A: memref<4096xf32>) -> memref<4096xf32> {
+ affine.for %i = 0 to 4096 step 100 {
+ affine.for %ii = affine_map<(d0) -> (d0)>(%i) to min #map_ub(%i) {
+ %5 = affine.load %A[%ii] : memref<4096xf32>
+ %6 = mulf %5, %5 : f32
+ affine.store %6, %A[%ii] : memref<4096xf32>
+ }
+ }
+ return %A : memref<4096xf32>
+}
+// CHECK: affine.for %[[IV1:.*]] = 0 to 4096 step 100
+// CHECK-NEXT: %[[BUF:.*]] = alloc() : memref<100xf32>
+// CHECK-NEXT: affine.for %[[IV2:.*]] = #[[MAP_IDENTITY]](%[[IV1]]) to min #[[MAP_MIN_UB1]](%[[IV1]]) {
+// CHECK-NEXT: affine.load %{{.*}}[%[[IV2]]] : memref<4096xf32>
+// CHECK-NEXT: affine.store %{{.*}}, %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: affine.for %[[IV2:.*]] = #[[MAP_IDENTITY]](%[[IV1]]) to min #[[MAP_MIN_UB2]](%[[IV1]]) {
+// CHECK-NEXT: affine.load %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32>
+// CHECK-NEXT: mulf
+// CHECK-NEXT: affine.store %{{.*}}, %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: affine.for %[[IV2:.*]] = #[[MAP_IDENTITY]](%[[IV1]]) to min #[[MAP_MIN_UB1]](%[[IV1]]) {
+// CHECK-NEXT: affine.load %[[BUF]][-%[[IV1]] + %[[IV2]]] : memref<100xf32>
+// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%[[IV2]]] : memref<4096xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: dealloc %[[BUF]] : memref<100xf32>
+// CHECK-NEXT: }
+
+// -----
+
+// Lower bound is a max; upper bound is a min. This pattern typically appears
+// with multi-level tiling when the tile sizes used don't divide loop trip
+// counts.
+
+#lb = affine_map<(d0, d1) -> (d0 * 512, d1 * 6)>
+#ub = affine_map<(d0, d1) -> (d0 * 512 + 512, d1 * 6 + 6)>
+
+// CHECK-DAG: #[[LB:.*]] = affine_map<()[s0, s1] -> (s0 * 512, s1 * 6)>
+// CHECK-DAG: #[[UB:.*]] = affine_map<()[s0, s1] -> (s0 * 512 + 512, s1 * 6 + 6)>
+
+// CHECK-LABEL: max_lower_bound(%{{.*}}: memref<2048x516xf64>,
+// CHECK-SAME: [[i:arg[0-9]+]]
+// CHECK-SAME: [[j:arg[0-9]+]]
+func @max_lower_bound(%M: memref<2048x516xf64>, %i : index, %j : index) {
+ affine.for %ii = 0 to 2048 {
+ affine.for %jj = max #lb(%i, %j) to min #ub(%i, %j) {
+ affine.load %M[%ii, %jj] : memref<2048x516xf64>
+ }
+ }
+ return
+}
+
+// CHECK: %[[BUF:.*]] = alloc() : memref<2048x6xf64>
+// CHECK-NEXT: affine.for %[[ii:.*]] = 0 to 2048 {
+// CHECK-NEXT: affine.for %[[jj:.*]] = max #[[LB]]()[%[[i]], %[[j]]] to min #[[UB]]()[%[[i]], %[[j]]] {
+// CHECK-NEXT: affine.load %{{.*}}[%[[ii]], %[[jj]]] : memref<2048x516xf64>
+// CHECK-NEXT: affine.store %{{.*}}, %[[BUF]][%[[ii]], %[[jj]] - symbol(%[[j]]) * 6] : memref<2048x6xf64>
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: affine.for %[[ii_:.*]] = 0 to 2048 {
+// CHECK-NEXT: affine.for %[[jj_:.*]] = max #[[LB]]()[%{{.*}}, %{{.*}}] to min #[[UB]]()[%{{.*}}, %{{.*}}] {
+// CHECK-NEXT: affine.load %[[BUF]][%[[ii_]], %[[jj_]] - symbol(%[[j]]) * 6] : memref<2048x6xf64>
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: dealloc %[[BUF]] : memref<2048x6xf64>
// -----
// Index of the buffer for the second DMA is remapped.
-// CHECK-DAG: [[MAP_PLUS_256:#map[0-9]+]] = affine_map<(d0) -> (d0 + 256)>
// CHECK-DAG: [[MAP0:#map[0-9]+]] = affine_map<(d0) -> (d0)>
// CHECK-LABEL: func @loop_nest_1d() {
// Second DMA transfer.
// CHECK: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<512xf32>, memref<256xf32, 2>, memref<1xi32>
// CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
- // CHECK: affine.for %{{.*}} = 0 to 256 {
+ // CHECK: affine.for %[[IV:.*]] = 0 to 256 {
// CHECK-NEXT: affine.load %{{.*}}[%{{.*}}] : memref<256xf32, 2>
- // CHECK: affine.apply [[MAP_PLUS_256]](%{{.*}})
- // Buffer for '%{{.*}}' in faster memref space is smaller size: 256xf32
- // Affine map for 'affine.load %{{.*}}' is composed: %{{.*}} + 256 - 256 = %{{.*}}.
- // CHECK-NEXT: %{{.*}} = affine.load %{{.*}}[%{{.*}}] : memref<256xf32, 2>
+ // Buffer for '%{{.*}}' in faster memref space is of smaller size: 256xf32
+ // Affine map for load on B is composed and becomes identity.
+ // CHECK: affine.load %{{.*}}[%[[IV]]] : memref<256xf32, 2>
// Already in faster memory space.
- // CHECK: affine.load %{{.*}}[%{{.*}}] : memref<256xf32, 2>
+ // CHECK: affine.load %{{.*}}[%[[IV]]] : memref<256xf32, 2>
// CHECK-NEXT: }
// CHECK-NEXT: dealloc %{{.*}} : memref<1xi32>
// CHECK-NEXT: dealloc %{{.*}} : memref<256xf32, 2>
// CHECK-NEXT: affine.for %{{.*}} = 0 to 32 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 32 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
-// CHECK-NEXT: affine.apply #map{{[0-9]+}}(%{{.*}}, %{{.*}})
-// CHECK-NEXT: %{{.*}} = affine.load [[BUFB]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2>
+// CHECK: affine.load [[BUFB]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2>
// CHECK-NEXT: "foo"(%{{.*}}) : (f32) -> ()
// CHECK-NEXT: }
// CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
-// CHECK-NEXT: affine.apply #map{{[0-9]+}}(%{{.*}}, %{{.*}})
-// CHECK-NEXT: affine.load [[BUFA]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2>
+// CHECK: affine.load [[BUFA]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2>
// CHECK-NEXT: "bar"(%{{.*}}) : (f32) -> ()
// CHECK-NEXT: }
// CHECK-NEXT: affine.for %{{.*}} = 0 to 16 {
// CHECK-NEXT: "abc_compute"() : () -> f32
-// CHECK-NEXT: affine.apply #map{{[0-9]+}}(%{{.*}}, %{{.*}})
-// CHECK-NEXT: affine.load [[BUFC]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2>
+// CHECK: affine.load [[BUFC]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2>
// CHECK-NEXT: "addf32"(%{{.*}}, %{{.*}}) : (f32, f32) -> f32
// CHECK-NEXT: affine.store %{{.*}}, [[BUFC]][%{{.*}} * 16 + %{{.*}}, %{{.*}}] : memref<512x32xf32, 2>
// CHECK-NEXT: }
// CHECK-LABEL: func @loop_nest_modulo() {
// CHECK: alloc() : memref<256x8xf32>
// CHECK-NEXT: affine.for %{{.*}} = 0 to 32 step 4 {
-// CHECK-NEXT: affine.apply #map{{[0-9]+}}(%{{.*}})
-// CHECK-NEXT: alloc() : memref<1x2xf32, 2>
+// CHECK: alloc() : memref<1x2xf32, 2>
// CHECK-NEXT: alloc() : memref<1xi32>
// Composition of the affine map for '%{{.*}}' causes '%{{.*}}' to be added as a symbol.
// CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}, 0], %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<256x8xf32>, memref<1x2xf32, 2>, memref<1xi32>
// -----
-// CHECK-DAG: [[MAP_SYM_SHIFT:#map[0-9]+]] = affine_map<(d0, d1)[s0, s1] -> (d1 + s0 + s1)>
-
// CHECK-LABEL: func @dma_with_symbolic_accesses
func @dma_with_symbolic_accesses(%A : memref<100x100xf32>, %M : index) {
%N = constant 9 : index
// CHECK-NEXT: alloc() : memref<1xi32>
// CHECK-NEXT: affine.dma_start %{{.*}}[0, symbol(%{{.*}}) + 9], %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}
// CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}}
-// CHECK-NEXT: affine.for %{{.*}} = 0 to 100 {
-// CHECK-NEXT: affine.for %{{.*}} = 0 to 100 {
-// CHECK-NEXT: affine.apply [[MAP_SYM_SHIFT]](%{{.*}}, %{{.*}})[%{{.*}}, %{{.*}}]
-// CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<100x100xf32, 2>
+// CHECK-NEXT: affine.for %[[IV0:.*]] = 0 to 100 {
+// CHECK-NEXT: affine.for %[[IV1:.*]] = 0 to 100 {
+// CHECK: affine.load %{{.*}}[%[[IV0]], %[[IV1]]] : memref<100x100xf32, 2>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK: return
// -----
-// CHECK-DAG: [[MAP_PLUS_64:#map[0-9]+]] = affine_map<(d0) -> (d0 + 64)>
-// CHECK-DAG: [[MAP_PLUS_128:#map[0-9]+]] = affine_map<(d0) -> (d0 + 128)>
-// CHECK-DAG: [[MAP_PLUS_2:#map[0-9]+]] = affine_map<(d0) -> (d0 + 2)>
-// CHECK-DAG: [[MAP_PLUS_192:#map[0-9]+]] = affine_map<(d0) -> (d0 + 192)>
-
// The first load accesses ([2,258), [128,384))
// The second load accesses ([64,320), [2,258))
// The first store writes to ([2,258), [192,448))
// CHECK-NEXT: alloc() : memref<1xi32>
// CHECK-NEXT: affine.for %{{.*}} = 0 to 256 {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 256 {
-// CHECK-NEXT: affine.apply [[MAP_PLUS_64]](%{{.*}})
-// CHECK-NEXT: affine.apply [[MAP_PLUS_128]](%{{.*}})
-// CHECK-NEXT: affine.apply [[MAP_PLUS_2]](%{{.*}})
-// CHECK-NEXT: affine.apply [[MAP_PLUS_2]](%{{.*}})
-// CHECK-NEXT: affine.load %{{.*}}[%{{.*}}, %{{.*}} + 126] : memref<382x446xf32, 2>
+// CHECK: affine.load %{{.*}}[%{{.*}}, %{{.*}} + 126] : memref<382x446xf32, 2>
// CHECK-NEXT: affine.load %{{.*}}[%{{.*}} + 62, %{{.*}}] : memref<382x446xf32, 2>
-// CHECK-NEXT: affine.apply [[MAP_PLUS_128]](%{{.*}})
-// CHECK-NEXT: affine.apply [[MAP_PLUS_192]](%{{.*}})
-// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}} + 190] : memref<382x446xf32, 2>
+// CHECK: affine.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}} + 190] : memref<382x446xf32, 2>
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}} + 126, %{{.*}}] : memref<382x446xf32, 2>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK: [[BUF:%[0-9]+]] = alloc() : memref<1027xf32, 2>
// CHECK-NEXT: [[MEM:%[0-9]+]] = alloc() : memref<1xi32>
// CHECK-NEXT: affine.for %{{.*}} = 0 to 1024 {
-// CHECK-NEXT: affine.for %{{.*}} = {{#map[0-9]+}}(%{{.*}}) to {{#map[0-9]+}}(%{{.*}}) {
-// CHECK-NEXT: constant 0.000000e+00 : f32
-// CHECK-NEXT: affine.store %{{.*}}, [[BUF]][%{{.*}}] : memref<1027xf32, 2>
+// CHECK-NEXT: affine.for %[[I2:.*]] = {{#map[0-9]+}}(%{{.*}}) to {{#map[0-9]+}}(%{{.*}}) {
+// CHECK: affine.store %{{.*}}, [[BUF]][%[[I2]]] : memref<1027xf32, 2>
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: affine.dma_start [[BUF]][%{{.*}}], %{{.*}}[%{{.*}}], [[MEM]][%{{.*}}], %{{.*}} : memref<1027xf32, 2>, memref<1027xf32>, memref<1xi32>
// -----
-// CHECK-DAG: [[MAP_READ_OFFSET:#map[0-9]+]] = affine_map<(d0) -> (d0 + 100)>
-// CHECK-DAG: [[MAP_WRITE_OFFSET:#map[0-9]+]] = affine_map<(d0) -> (d0 + 25)>
-
func @test_read_write_region_union() {
%0 = alloc() : memref<256xf32>
affine.for %i0 = 0 to 10 {
// CHECK-NEXT: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32>
// CHECK-NEXT: alloc() : memref<1xi32>
// CHECK-NEXT: affine.for %{{.*}} = 0 to 10 {
-// CHECK-NEXT: affine.apply [[MAP_READ_OFFSET]](%{{.*}})
-// CHECK-NEXT: affine.apply [[MAP_WRITE_OFFSET]](%{{.*}})
-// CHECK-NEXT: affine.load %{{.*}}[%{{.*}} + 75] : memref<85xf32, 2>
+// CHECK: affine.load %{{.*}}[%{{.*}} + 75] : memref<85xf32, 2>
// CHECK-NEXT: affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<85xf32, 2>
// CHECK-NEXT: }
// CHECK-NEXT: affine.dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}} : memref<85xf32, 2>, memref<256xf32>, memref<1xi32>
#include "mlir/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/IR/PatternMatch.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/LoopUtils.h"
#include "mlir/Transforms/Passes.h"
/*fastMemorySpace=*/0,
/*tagMemorySpace=*/0,
/*fastMemCapacityBytes=*/32 * 1024 * 1024UL};
+ DenseSet<Operation *> copyNests;
if (clMemRefFilter) {
- DenseSet<Operation *> copyNests;
affineDataCopyGenerate(loopNest, copyOptions, load.getMemRef(), copyNests);
} else if (clTestGenerateCopyForMemRegion) {
CopyGenerateResult result;
region.compute(load, /*loopDepth=*/0);
generateCopyForMemRegion(region, loopNest, copyOptions, result);
}
+
+ // Promote any single iteration loops in the copy nests.
+ for (auto nest : copyNests)
+ nest->walk([](AffineForOp forOp) { promoteIfSingleIteration(forOp); });
+
+ // Promoting single iteration loops could lead to simplification
+ // of loads/stores. We will run the canonicalization patterns again.
+ OwningRewritePatternList patterns;
+ AffineLoadOp::getCanonicalizationPatterns(patterns, &getContext());
+ AffineStoreOp::getCanonicalizationPatterns(patterns, &getContext());
+ applyPatternsGreedily(getFunction(), std::move(patterns));
}
namespace mlir {