//===----------------------------------------------------------------------===//
SparseTensorLoopEmitter::SparseTensorLoopEmitter(ValueRange tensors,
- bool isLastOutput)
- : tensors(tensors.begin(), tensors.end()), dims(tensors.size()),
- pidxs(tensors.size()), coord(tensors.size()), highs(tensors.size()),
- sizes(tensors.size()), ptrBuffer(tensors.size()),
- idxBuffer(tensors.size()), valBuffer(tensors.size()),
- isLastOutput(isLastOutput), loopStack(), curLv(tensors.size(), 0) {
- for (size_t i = 0, e = tensors.size(); i < e; i++) {
- auto t = tensors[i];
- auto rtp = t.getType().dyn_cast<RankedTensorType>();
- if (!rtp) // a scalar (0-dimension tensors)
+ bool hasOutput,
+ bool isSparseOut)
+ : hasOutput(hasOutput), tensors(tensors.begin(), tensors.end()),
+ dimTypes(tensors.size()), pidxs(tensors.size()), coord(tensors.size()),
+ highs(tensors.size()), ptrBuffer(tensors.size()),
+ idxBuffer(tensors.size()), valBuffer(tensors.size()), loopStack() {
+ for (size_t tid = 0, e = tensors.size(); tid < e; tid++) {
+ auto t = tensors[tid];
+ // A scalar or a 0-dimension tensor.
+ if (isZeroRankedTensorOrScalar(t.getType()))
continue;
-
+ auto rtp = t.getType().cast<RankedTensorType>();
auto rank = static_cast<size_t>(rtp.getRank());
auto enc = getSparseTensorEncoding(rtp);
- if (enc)
+ // We always treat a sparse output tensor as dense so that we always iterate
+ // it based on its dim sizes.
+ if (enc && !(isOutputTensor(tid) && isSparseOut))
for (auto dimTp : enc.getDimLevelType())
- dims[i].push_back(dimTp);
+ dimTypes[tid].push_back(dimTp);
else
- dims[i].assign(rank, DimLevelType::Dense);
+ dimTypes[tid].assign(rank, DimLevelType::Dense);
// Initialize using empty value.
- pidxs[i].assign(rank, Value());
- coord[i].assign(rank, Value());
- highs[i].assign(rank, Value());
- sizes[i].assign(rank, Value());
- ptrBuffer[i].assign(rank, Value());
- idxBuffer[i].assign(rank, Value());
+ pidxs[tid].assign(rank, Value());
+ coord[tid].assign(rank, Value());
+ highs[tid].assign(rank, Value());
+ ptrBuffer[tid].assign(rank, Value());
+ idxBuffer[tid].assign(rank, Value());
}
}
-void SparseTensorLoopEmitter::initializeLoopEmit(OpBuilder &builder,
- Location loc) {
+void SparseTensorLoopEmitter::initializeLoopEmit(
+ OpBuilder &builder, Location loc,
+ SparseTensorLoopEmitter::OutputUpdater updater) {
// For every tensor, find lower and upper bound on dimensions, set the
// same bounds on loop indices, and obtain dense or sparse buffer(s).
- // TODO: Provides ability to generate loop on output buffer (with undef
- // dim level in Merger in GenericOp Sparsification).
for (size_t t = 0, e = tensors.size(); t < e; t++) {
auto tensor = tensors[t];
- auto rtp = tensor.getType().cast<RankedTensorType>();
+ auto rtp = tensor.getType().dyn_cast<RankedTensorType>();
+ if (!rtp)
+ // Skip only scalars; zero-ranked tensors still need to be bufferized and
+ // (probably) filled with zeros by users.
+ continue;
auto rank = rtp.getRank();
auto shape = rtp.getShape();
auto enc = getSparseTensorEncoding(rtp);
// Scan all dimensions of current tensor.
for (int64_t d = 0; d < rank; d++) {
// This should be called only once at the beginning.
- assert(!ptrBuffer[t][d] && !idxBuffer[t][d] && !sizes[t][d] &&
- !highs[t][d]);
+ assert(!ptrBuffer[t][d] && !idxBuffer[t][d] && !highs[t][d]);
// Handle sparse storage schemes.
- if (isCompressedDLT(dims[t][d])) {
+ if (isCompressedDLT(dimTypes[t][d])) {
auto ptrTp =
MemRefType::get(dynShape, getPointerOverheadType(builder, enc));
auto indTp =
// Generate sparse primitives to obtain pointers and indices.
ptrBuffer[t][d] = builder.create<ToPointersOp>(loc, ptrTp, tensor, dim);
idxBuffer[t][d] = builder.create<ToIndicesOp>(loc, indTp, tensor, dim);
- } else if (isSingletonDLT(dims[t][d])) {
+ } else if (isSingletonDLT(dimTypes[t][d])) {
// Singleton dimension, fetch indices.
auto indTp =
MemRefType::get(dynShape, getIndexOverheadType(builder, enc));
idxBuffer[t][d] = builder.create<ToIndicesOp>(loc, indTp, tensor, dim);
} else {
// Dense dimension, nothing to fetch.
- assert(isDenseDLT(dims[t][d]));
+ assert(isDenseDLT(dimTypes[t][d]));
}
// Find upper bound in current dimension.
unsigned p = toOrigDim(enc, d);
Value up = mlir::linalg::createOrFoldDimOp(builder, loc, tensor, p);
- sizes[t][d] = highs[t][d] = up;
+ highs[t][d] = up;
}
+
// Perform the required bufferization. Dense inputs materialize
- // from the input tensors. Dense outputs need special handling.
- // Sparse inputs use sparse primitives to obtain the values.
+ // from the input tensors. Sparse inputs use sparse primitives to obtain the
+ // values.
+ // Delegates extra output initialization to clients.
+ bool isOutput = isOutputTensor(t);
Type elementType = rtp.getElementType();
-
if (!enc) {
// Non-annotated dense tensors.
auto denseTp = MemRefType::get(shape, elementType);
- if (isLastOutput && t == tensors.size() - 1)
- llvm_unreachable("TODO: not yet handled");
- else
- valBuffer[t] =
- builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
+ Value denseVal =
+ builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
+ // Dense outputs need special handling.
+ if (isOutput && updater)
+ denseVal = updater(builder, loc, denseVal, tensor);
+
+ valBuffer[t] = denseVal;
} else {
// Annotated sparse tensors.
+ // We also need the value buffer for an annotated all-dense `sparse` tensor.
auto dynShape = {ShapedType::kDynamicSize};
auto sparseTp = MemRefType::get(dynShape, elementType);
valBuffer[t] = builder.create<ToValuesOp>(loc, sparseTp, tensor);
}
- // Prepare to enter the first dim for all (input) tensors
- prepareLoopOverTensorAtDim(builder, loc, t, 0);
+ // NOTE: we could also prepare for dim 0 here in advance; this would hoist
+ // some loop preparation out of tensor iteration, but would also (undesirably)
+ // hoist the code outside of the if conditions.
}
}
+void SparseTensorLoopEmitter::enterNewLoopSeq(OpBuilder &builder, Location loc,
+ ArrayRef<size_t> tids,
+ ArrayRef<size_t> dims) {
+ assert(loopSeqStack.size() == loopStack.size());
+ // The universal index starts from 0.
+ loopSeqStack.emplace_back(constantIndex(builder, loc, 0));
+ // Prepares for all the tensors used in the current loop sequence.
+ for (auto [tid, dim] : llvm::zip(tids, dims))
+ prepareLoopOverTensorAtDim(builder, loc, tid, dim);
+}
+
Operation *SparseTensorLoopEmitter::enterLoopOverTensorAtDim(
OpBuilder &builder, Location loc, size_t tid, size_t dim,
- ArrayRef<Value> reduc) {
- assert(dims[tid].size() > dim);
+ MutableArrayRef<Value> reduc, bool isParallel, ArrayRef<size_t> extraTids,
+ ArrayRef<size_t> extraDims) {
+ assert(dimTypes[tid].size() > dim);
// We cannot re-enter the same level.
assert(!coord[tid][dim]);
+
Value step = constantIndex(builder, loc, 1);
- auto dimType = dims[tid][dim];
- bool isSparse = isCompressedDLT(dimType) || isSingletonDLT(dimType);
+ auto dimType = dimTypes[tid][dim];
+ bool isSparseInput = isCompressedDLT(dimType) || isSingletonDLT(dimType);
assert(isDenseDLT(dimType) || isCompressedDLT(dimType) ||
isSingletonDLT(dimType));
- Value lo = isSparse ? pidxs[tid][dim] : constantIndex(builder, loc, 0);
+ Value lo = isSparseInput ? pidxs[tid][dim] // current offset
+ : loopSeqStack.back(); // universal index
Value hi = highs[tid][dim];
- // TODO: support reduction.
- if (!reduc.empty())
- llvm_unreachable("TODO: not implemented yet");
-
scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);
builder.setInsertionPointToStart(forOp.getBody());
Value iv = forOp.getInductionVar();
- Operation *loop = forOp;
-
assert(iv);
- if (isSparse) {
+ if (isSparseInput) {
pidxs[tid][dim] = iv;
// Generating a load on the indices array yields the coordinate.
Value ptr = idxBuffer[tid][dim];
- // TODO: generates load for vector value.
coord[tid][dim] = genIndexLoad(builder, loc, ptr, iv);
} else {
// Dense tensor, the coordinate is the induction variable.
coord[tid][dim] = iv;
// generate pidx for dense dim (pidx = i * sz + j)
- // TODO: handle vector loop.
- Value p = dim == 0 ? constantIndex(builder, loc, 0) : pidxs[tid][dim - 1];
- Value mul = builder.create<arith::MulIOp>(loc, sizes[tid][dim], p);
- Value add = builder.create<arith::AddIOp>(loc, mul, iv);
- pidxs[tid][dim] = add;
+ auto enc = getSparseTensorEncoding(tensors[tid].getType());
+ if (enc)
+ pidxs[tid][dim] = genAddress(builder, loc, tid, dim, iv);
}
- // Prepares for next dim if this is not currently the innermost dimension.
- if (dim != dims[tid].size() - 1)
- prepareLoopOverTensorAtDim(builder, loc, tid, dim + 1);
+ // NOTE: we could also prepare for the next dim here in advance.
+ // Push the loop onto the stack.
+ loopStack.emplace_back(ArrayRef<size_t>(tid), ArrayRef<size_t>(dim), forOp,
+ coord[tid][dim]);
+ // Emit extra locals.
+ emitExtraLocalsForTensorsAtDenseDims(builder, loc, extraTids, extraDims);
+
+ // In-place update on the reduction variable vector.
+ assert(forOp.getNumRegionIterArgs() == reduc.size());
+ for (int i = 0, e = reduc.size(); i < e; i++)
+ reduc[i] = forOp.getRegionIterArg(i);
+ return forOp;
+}
+
+Operation *SparseTensorLoopEmitter::enterCoIterationOverTensorsAtDims(
+ OpBuilder &builder, Location loc, ArrayRef<size_t> tids,
+ ArrayRef<size_t> dims, bool needsUniv, MutableArrayRef<Value> reduc,
+ ArrayRef<size_t> extraTids, ArrayRef<size_t> extraDims) {
+ assert(tids.size() == dims.size());
+ SmallVector<Type, 4> types;
+ SmallVector<Value, 4> operands;
+ // Construct the while-loop with a parameter for each index.
+ Type indexType = builder.getIndexType();
+ for (auto [tid, dim] : llvm::zip(tids, dims)) {
+ if (isCompressedDLT(dimTypes[tid][dim]) ||
+ isSingletonDLT(dimTypes[tid][dim])) {
+ assert(pidxs[tid][dim]);
+ types.push_back(indexType);
+ operands.push_back(pidxs[tid][dim]);
+ }
+ }
+ // The position where user-supplied reduction variable starts.
+ for (Value rec : reduc) {
+ types.push_back(rec.getType());
+ operands.push_back(rec);
+ }
+ if (needsUniv) {
+ types.push_back(indexType);
+ // Pass the current universal index as a while-loop operand.
+ operands.push_back(loopSeqStack.back());
+ }
+ assert(types.size() == operands.size());
+ scf::WhileOp whileOp = builder.create<scf::WhileOp>(loc, types, operands);
+
+ SmallVector<Location> locs(types.size(), loc);
+ Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, locs);
+ Block *after = builder.createBlock(&whileOp.getAfter(), {}, types, locs);
+
+ // Build the "before" region, which effectively consists
+ // of a conjunction of "i < upper" tests on all induction.
+ builder.setInsertionPointToStart(&whileOp.getBefore().front());
+ Value cond;
+ unsigned o = 0;
+ for (auto [tid, dim] : llvm::zip(tids, dims)) {
+ if (isCompressedDLT(dimTypes[tid][dim]) ||
+ isSingletonDLT(dimTypes[tid][dim])) {
+ Value op1 = before->getArgument(o);
+ Value op2 = highs[tid][dim];
+ Value opc = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::ult,
+ op1, op2);
+ cond = cond ? builder.create<arith::AndIOp>(loc, cond, opc) : opc;
+ // Update the pidx with the corresponding "after"-block argument.
+ pidxs[tid][dim] = after->getArgument(o++);
+ }
+ }
+ builder.create<scf::ConditionOp>(loc, cond, before->getArguments());
+
+ // Generates while body.
+ builder.setInsertionPointToStart(&whileOp.getAfter().front());
+ Value min;
+ for (auto [tid, dim] : llvm::zip(tids, dims)) {
+ // Load the coordinate and (unless the universal index is used) track the minimum.
+ if (isCompressedDLT(dimTypes[tid][dim]) ||
+ isSingletonDLT(dimTypes[tid][dim])) {
+ Value ptr = idxBuffer[tid][dim];
+ Value s = pidxs[tid][dim];
+ Value load = genIndexLoad(builder, loc, ptr, s);
+ coord[tid][dim] = load;
+ if (!needsUniv) {
+ if (min) {
+ Value cmp = builder.create<arith::CmpIOp>(
+ loc, arith::CmpIPredicate::ult, load, min);
+ min = builder.create<arith::SelectOp>(loc, cmp, load, min);
+ } else {
+ min = load;
+ }
+ }
+ }
+ }
- loopStack.push_back(LoopLevelInfo({tid}, {dim}, coord[tid][dim]));
- return loop;
-}
+ if (needsUniv) {
+ assert(!min);
+ // When using the universal index, the minimum is simply the last while-loop argument.
+ min = after->getArguments().back();
+ }
+
+ for (auto [tid, dim] : llvm::zip(tids, dims)) {
+ // All dense dims (as well as the sparse output tensor) share the same pidx
+ // in the while loop.
+ if (isDenseDLT(dimTypes[tid][dim])) {
+ pidxs[tid][dim] = min;
+ // generate pidx for dense dim (pidx = i * sz + j)
+ auto enc = getSparseTensorEncoding(tensors[tid].getType());
+ if (enc)
+ pidxs[tid][dim] = genAddress(builder, loc, tid, dim, min);
+ }
+ // NOTE: we could also prepare for the next dim here in advance.
+ }
+ // Sets up the loop stack.
+ loopStack.emplace_back(tids, dims, whileOp, min);
+ assert(loopStack.size() == loopSeqStack.size());
-void SparseTensorLoopEmitter::enterCoiterationOverTensorsAtDims(
- OpBuilder &builder, Location loc, ArrayRef<size_t> ts,
- ArrayRef<size_t> ds) {
- llvm_unreachable("TODO: unimplemented");
+ // Emits extra locals
+ emitExtraLocalsForTensorsAtDenseDims(builder, loc, extraTids, extraDims);
+
+ // Updates reduction variables
+ assert(after->getNumArguments() == o + reduc.size() + (needsUniv ? 1 : 0));
+ // In-place update on reduction variable.
+ for (unsigned i = 0, e = reduc.size(); i < e; i++)
+ reduc[i] = after->getArgument(o + i);
+
+ return whileOp;
}
-bool SparseTensorLoopEmitter::prepareLoopOverTensorAtDim(OpBuilder &builder,
+void SparseTensorLoopEmitter::prepareLoopOverTensorAtDim(OpBuilder &builder,
Location loc,
size_t tid,
size_t dim) {
- // TODO: generate loop iteration on output tensor based on the shape
- // instead of pointer/indices arrays.
- assert(dims[tid].size() > dim);
- auto dimType = dims[tid][dim];
+ assert(dimTypes[tid].size() > dim);
+ auto dimType = dimTypes[tid][dim];
if (isDenseDLT(dimType))
- return false;
+ return;
// Either the first dimension, or the previous dimension has been set.
assert(dim == 0 || pidxs[tid][dim - 1]);
Value ptr = ptrBuffer[tid][dim];
Value pLo = dim == 0 ? c0 : pidxs[tid][dim - 1];
- Value pHi = builder.create<arith::AddIOp>(loc, pLo, c1);
-
pidxs[tid][dim] = genIndexLoad(builder, loc, ptr, pLo);
+
+ Value pHi = builder.create<arith::AddIOp>(loc, pLo, c1);
highs[tid][dim] = genIndexLoad(builder, loc, ptr, pHi);
- return true;
+ return;
}
if (isSingletonDLT(dimType)) {
Value pLo = dim == 0 ? c0 : pidxs[tid][dim - 1];
pidxs[tid][dim] = pLo;
highs[tid][dim] = pHi;
- return true;
+ return;
}
llvm_unreachable("Unrecognizable dimesion type!");
}
-Value SparseTensorLoopEmitter::emitExtraLocalsForTensorsAtDims(
- OpBuilder &builder, Location loc, size_t tid, size_t dim) {
- llvm_unreachable("TODO: not implemented yet");
+void SparseTensorLoopEmitter::emitExtraLocalsForTensorsAtDenseDims(
+ OpBuilder &builder, Location loc, ArrayRef<size_t> tids,
+ ArrayRef<size_t> dims) {
+ // Initialize dense positions. Note that we generate dense indices of the
+ // output tensor unconditionally, since they may not appear in the lattice,
+ // but may be needed for linearized codegen.
+ for (auto [tid, dim] : llvm::zip(tids, dims)) {
+ assert(isDenseDLT(dimTypes[tid][dim]));
+ auto enc = getSparseTensorEncoding(tensors[tid].getType());
+ if (enc) {
+ bool validPidx = dim == 0 || pidxs[tid][dim - 1];
+ if (!validPidx) {
+ // The pidx for the sparse output tensor might be missing, since its dense
+ // dims are handled unconditionally by sparsification even when they do not
+ // appear in the lattice.
+ assert(isOutputTensor(tid));
+ continue;
+ }
+ pidxs[tid][dim] = genAddress(builder, loc, tid, dim, loopStack.back().iv);
+ // NOTE: we could also prepare for the next dim here in advance.
+ }
+ }
}
-void SparseTensorLoopEmitter::exitCurrentLoop() {
- // Clean up the values, it would help use to discover potential bug at a
- // earlier stage (instead of silently using a wrong value).
+SmallVector<Value, 2>
+SparseTensorLoopEmitter::exitForLoop(OpBuilder &builder, Location loc,
+ ArrayRef<Value> reduc) {
LoopLevelInfo &loopInfo = loopStack.back();
- assert(loopInfo.tensors.size() == loopInfo.dims.size());
- for (auto info : llvm::zip(loopInfo.tensors, loopInfo.dims)) {
- auto tid = std::get<0>(info);
- auto dim = std::get<1>(info);
- assert(pidxs[tid][dim] && coord[tid][dim] && highs[tid][dim]);
+ auto &dims = loopStack.back().dims;
+ auto &tids = loopStack.back().tids;
+ auto forOp = llvm::cast<scf::ForOp>(loopInfo.loop);
+ if (!reduc.empty()) {
+ assert(reduc.size() == forOp.getNumResults());
+ builder.setInsertionPointToEnd(forOp.getBody());
+ builder.create<scf::YieldOp>(loc, reduc);
+ }
+
+ // Finished iterating a tensor, clean up.
+ // We only do the clean-up on a for loop, as while loops do not necessarily
+ // finish iterating over a sparse tensor.
+ for (auto [tid, dim] : llvm::zip(tids, dims)) {
// Reset to null.
- pidxs[tid][dim] = Value();
coord[tid][dim] = Value();
- if (!isDenseDLT(dims[tid][dim]))
- // Dense dimension, high is fixed.
+ pidxs[tid][dim] = Value();
+ // The high for a dense dimension is fixed (the dim size), so only reset
+ // it for sparse dimensions.
+ if (!isDenseDLT(dimTypes[tid][dim]))
highs[tid][dim] = Value();
}
+ // Exit the loop.
+ builder.setInsertionPointAfter(forOp);
+ return forOp.getResults();
+}
+
+SmallVector<Value, 2>
+SparseTensorLoopEmitter::exitCoiterationLoop(OpBuilder &builder, Location loc,
+ ArrayRef<Value> reduc) {
+ auto whileOp = llvm::cast<scf::WhileOp>(loopStack.back().loop);
+ auto &dims = loopStack.back().dims;
+ auto &tids = loopStack.back().tids;
+ Value iv = loopStack.back().iv;
+ // Generate the while-loop induction at the end.
+ builder.setInsertionPointToEnd(&whileOp.getAfter().front());
+ // Finalize the induction. Note that the induction could be performed
+ // in the individual if-branches to avoid re-evaluating the conditions.
+ // However, that would result in a rather elaborate forest of yield
+ // instructions during code generation. Moreover, performing the induction
+ // after the if-statements more closely resembles code generated by TACO.
+ unsigned o = 0;
+ SmallVector<Value, 4> operands;
+ Value one = constantIndex(builder, loc, 1);
+ for (auto [tid, dim] : llvm::zip(tids, dims)) {
+ if (isCompressedDLT(dimTypes[tid][dim]) ||
+ isSingletonDLT(dimTypes[tid][dim])) {
+ Value op1 = coord[tid][dim];
+ Value op3 = pidxs[tid][dim];
+ Value cmp =
+ builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq, op1, iv);
+ Value add = builder.create<arith::AddIOp>(loc, op3, one);
+ operands.push_back(builder.create<arith::SelectOp>(loc, cmp, add, op3));
+ // Following loops continue iteration from the break point of the
+ // current while loop.
+ pidxs[tid][dim] = whileOp->getResult(o++);
+ // The coordinates are invalid now.
+ coord[tid][dim] = nullptr;
+ // highs remains unchanged.
+ }
+ }
+
+ // Reduction value from users.
+ SmallVector<Value, 2> ret;
+ for (auto red : reduc) {
+ operands.push_back(red);
+ ret.push_back(whileOp->getResult(o++));
+ }
+
+ // An (optional) universal index.
+ if (operands.size() < whileOp.getNumResults()) {
+ assert(operands.size() + 1 == whileOp.getNumResults());
+ // The last one is the universal index.
+ operands.push_back(builder.create<arith::AddIOp>(loc, iv, one));
+ // Update the loop starting point of the current loop sequence.
+ loopSeqStack.back() = whileOp->getResult(o++);
+ }
+
+ assert(o == operands.size());
+ builder.create<scf::YieldOp>(loc, operands);
+ builder.setInsertionPointAfter(whileOp);
+ return ret;
+}
+
+SmallVector<Value, 2>
+SparseTensorLoopEmitter::exitCurrentLoop(OpBuilder &builder, Location loc,
+ ArrayRef<Value> reduc) {
+ // Clean up the values; this helps us discover potential bugs at an
+ // earlier stage (instead of silently using a wrong value).
+ LoopLevelInfo &loopInfo = loopStack.back();
+ assert(loopInfo.tids.size() == loopInfo.dims.size());
+ SmallVector<Value, 2> red;
+ if (llvm::isa<scf::WhileOp>(loopInfo.loop)) {
+ red = exitCoiterationLoop(builder, loc, reduc);
+ } else {
+ red = exitForLoop(builder, loc, reduc);
+ }
+
+ assert(loopStack.size() == loopSeqStack.size());
loopStack.pop_back();
+ return red;
}
//===----------------------------------------------------------------------===//
auto srcDim = srcShape[i];
// Iterate through dimensions expanded from the i-th dimension.
for (unsigned j = start; j < start + map.size(); j++) {
- // There can be only one dynamic sized dimension among dimensions expanded
- // from the i-th dimension in srcShape. For example, if srcDim = 8, then
- // the expanded shape could be <2x?x2>, but not <2x?x?>.
+ // There can be only one dynamic sized dimension among dimensions
+ // expanded from the i-th dimension in srcShape.
+ // For example, if srcDim = 8, then the expanded shape could be <2x?x2>,
+ // but not <2x?x?>.
if (staticDstShape[j] == ShapedType::kDynamicSize) {
// The expanded dimension has dynamic size. We compute the dimension
// by dividing srcDim by the product of the static dimensions.
enum class EmitCInterface : bool { Off = false, On = true };
//===----------------------------------------------------------------------===//
-// SparseTensorLoopEmiter class, manages sparse tensors and helps to generate
-// loop structure to (co-iterate) sparse tensors.
-//
-// An example usage:
-// To generate following loops over T1<?x?> and T2<?x?>
-//
-// for i in T1[0] {
-// for j : T2[0] {
-// for k : T1[1] {}
-// for k : T2[1] {}
-// }
-// }
-//
-// One can use
-//
-// SparseTensorLoopEmiter loopEmiter({T1, T1});
-// loopEmiter.initializeLoopEmit();
-// loopEmiter.enterLoopOverTensorAtDim(T1, 0);
-// loopEmiter.enterLoopOverTensorAtDim(T2, 0);
-// loopEmiter.enterLoopOverTensorAtDim(T1, 1);
-// loopEmiter.exitCurrentLoop();
-// loopEmiter.enterLoopOverTensorAtDim(T2, 1);
-// for 0 -> 3:
-// loopEmiter.exitCurrentLoop();
-//===----------------------------------------------------------------------===//
-
-// TODO: Sparsification should also rely on this class to generate loops.
-class SparseTensorLoopEmitter {
-public:
- /// Constructor: take an array of tensors inputs, on which the generated loops
- /// will iterate on. The index of the tensor in the array is also the
- /// tensor id (tid) used in related functions.
- explicit SparseTensorLoopEmitter(ValueRange tensors,
- bool isLastOutput = false);
-
- ///
- /// Core functions.
- ///
-
- /// Starts a loop emitting session:
- /// 1. Generates all the buffers needed to iterate tensors.
- /// 2. Generates the lo/hi bounds to iterate tensors[0].
- void initializeLoopEmit(OpBuilder &builder, Location loc);
-
- // TODO: Gets rid of `dim` in the argument list? Track the dimension we
- // are currently at internally. Then it would be enterNextDimForTensor.
-
- /// Emits loop over tensor[dim], it assumes that loops between
- /// tensor[0...dim - 1] have already been generated.
- /// It also prepares to enter tensor[dim + 1].
- Operation *enterLoopOverTensorAtDim(OpBuilder &builder, Location loc,
- size_t tid, size_t dim,
- ArrayRef<Value> reduc = {});
-
- /// Emits a coiteration loop over a set of tensors.
- // TODO: not yet implemented
- void enterCoiterationOverTensorsAtDims(OpBuilder &builder, Location loc,
- ArrayRef<size_t> ts,
- ArrayRef<size_t> ds);
-
- /// Emits extra locals, since the locals might not be in simplified lattices
- /// point used to generate the loops, but are still required to generates
- /// expressions.
- Value emitExtraLocalsForTensorsAtDims(OpBuilder &builder, Location loc,
- size_t tid, size_t dim);
-
- void exitCurrentLoop();
-
- /// Return the array of coordinate for all the loop generated till now.
- void getCoordinateArray(SmallVectorImpl<Value> &coords) {
- for (auto &l : loopStack)
- coords.push_back(l.idx);
- }
-
- ///
- /// Getters.
- ///
-
- Value getTensorValueBuffer(size_t tid) { return valBuffer[tid]; }
- Value getLastLevelTensorPointerIndex(size_t tid) {
- return pidxs[tid].back();
- };
-
-private:
- struct LoopLevelInfo {
- LoopLevelInfo(ArrayRef<size_t> ts, ArrayRef<size_t> ds, Value idx)
- : tensors(ts), dims(ds), idx(idx) {}
- llvm::SmallVector<size_t, 4> tensors;
- llvm::SmallVector<size_t, 4> dims;
- Value idx;
- };
-
- /// Return false if tid[dim] is a dense dimension that does not need to be
- /// prepared (to be used by sparsification for needUniv).
- bool prepareLoopOverTensorAtDim(OpBuilder &builder, Location loc, size_t tid,
- size_t dim);
-
- /// Input (TODO: and output) tensors.
- std::vector<Value> tensors;
- /// The dim type array for each tensor.
- std::vector<std::vector<DimLevelType>> dims;
- /// Sparse iteration information (by tensor and dim). These arrays
- /// are updated to remain current within the current loop.
- std::vector<std::vector<Value>> pidxs;
- std::vector<std::vector<Value>> coord;
- std::vector<std::vector<Value>> highs;
- /// Universal dense indices and upper bounds (by index). The sizes array is
- /// set once with the inferred dimension sizes.
- std::vector<std::vector<Value>> sizes;
- std::vector<std::vector<Value>> ptrBuffer; // to_pointers
- std::vector<std::vector<Value>> idxBuffer; // to_indices
- std::vector<Value> valBuffer; // to_value
-
- bool isLastOutput; // Is the last tensor output tensor
- std::vector<LoopLevelInfo> loopStack;
- // TODO: not yet used, it should track the current level for each tensor
- // to help eliminate `dim` paramters from above APIs.
- std::vector<size_t> curLv;
-};
-
-//===----------------------------------------------------------------------===//
// ExecutionEngine/SparseTensorUtils helper functions.
//===----------------------------------------------------------------------===//
return constantI8(builder, loc, static_cast<uint8_t>(dlt));
}
+inline bool isZeroRankedTensorOrScalar(Type type) {
+ auto rtp = type.dyn_cast<RankedTensorType>();
+ return !rtp || rtp.getRank() == 0;
+}
+
+//===----------------------------------------------------------------------===//
+// SparseTensorLoopEmitter class, manages sparse tensors and helps to generate
+// loop structure to (co-)iterate sparse tensors.
+//
+// An example usage:
+// To generate the following loops over T1<?x?> and T2<?x?>
+//
+// for i in TENSOR_1_0 {
+// for j : TENSOR_2_0 {
+// for k : TENSOR_1_1 {}
+// for k : TENSOR_2_1 {}
+// }
+// }
+//
+// One can use
+//
+// SparseTensorLoopEmitter loopEmitter({T1, T2});
+// loopEmitter.initializeLoopEmit();
+// loopEmitter.enterLoopOverTensorAtDim(T1, 0);
+// loopEmitter.enterLoopOverTensorAtDim(T2, 0);
+// loopEmitter.enterLoopOverTensorAtDim(T1, 1);
+// loopEmitter.exitCurrentLoop();
+// loopEmitter.enterLoopOverTensorAtDim(T2, 1);
+// loopEmitter.exitCurrentLoop(); // exit k
+// loopEmitter.exitCurrentLoop(); // exit j
+// loopEmitter.exitCurrentLoop(); // exit i
+//===----------------------------------------------------------------------===//
+
+// TODO: Sparsification should also rely on this class to generate loops.
+class SparseTensorLoopEmitter {
+public:
+ /// Optional callback function to setup dense output tensors when
+ /// initializing the loop emitter (e.g., to fill a dense output with zeros).
+ using OutputUpdater = function_ref<Value(OpBuilder &builder, Location loc,
+ Value memref, Value tensor)>;
+
+ /// Constructor: takes an array of input tensors, which the generated loops
+ /// will iterate over. The index of the tensor in the array is also the
+ /// tensor id (tid) used in related functions.
+ /// If isSparseOut is set, the loop emitter assumes that the sparse output
+ /// tensor is empty, and will always generate loops on it based on the dim
+ /// sizes.
+ explicit SparseTensorLoopEmitter(ValueRange tensors, bool hasOutput = false,
+ bool isSparseOut = false);
+
+ /// Starts a loop emitting session by generating all the buffers needed to
+ /// iterate tensors.
+ void initializeLoopEmit(OpBuilder &builder, Location loc,
+ OutputUpdater updater = nullptr);
+
+ /// Enters a new loop sequence; the loops within the same sequence start from
+ /// the break points of the previous loop instead of starting over from 0.
+ /// e.g.,
+ /// {
+ /// // loop sequence start.
+ /// p0 = while(xxx)
+ /// ...
+ /// break p0
+ ///
+ /// // Starts loop from p0
+ /// for (i = p0; i < end; i++)
+ /// ...
+ /// // loop sequence end.
+ /// }
+ void enterNewLoopSeq(OpBuilder &builder, Location loc, ArrayRef<size_t> tids,
+ ArrayRef<size_t> dims);
+
+ // Exits the current loop sequence; this will reset the universal index to 0.
+ void exitCurrentLoopSeq() {
+ assert(loopSeqStack.size() == loopStack.size() + 1);
+ loopSeqStack.pop_back();
+ }
+
+ // TODO: Get rid of `dim` in the argument list? Track the dimension we
+ // are currently at internally. Then it would be enterNextDimForTensor.
+ // We still need a way to specify the dim for non-annotated dense tensors,
+ // though, as they can be accessed out of order.
+ /// Emits a loop over tensor_tid_dim, assuming that the loops over
+ /// tensor_tid_[0, dim - 1] have already been generated.
+ /// The function also performs an in-place update on the `reduc` vector to
+ /// return the reduction variables used inside the generated loop.
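+ /// E.g., for a compressed dimension, the emitted loop is (roughly):
+ ///
+ ///   for (pidx = pidxs[tid][dim]; pidx < highs[tid][dim]; pidx++) {
+ ///     coord[tid][dim] = idxBuffer[tid][dim][pidx];
+ ///     ... loop body ...
+ ///   }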
+ Operation *enterLoopOverTensorAtDim(OpBuilder &builder, Location loc,
+ size_t tid, size_t dim,
+ MutableArrayRef<Value> reduc = {},
+ bool isParallel = false,
+ ArrayRef<size_t> extraTids = {},
+ ArrayRef<size_t> extraDims = {});
+
+ /// Emits a co-iteration loop over a set of tensors.
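+ /// E.g., when co-iterating two sparse tensors on index i, the emitted
+ /// structure is (roughly):
+ ///
+ ///   while (p0 < highs[t0][d0] && p1 < highs[t1][d1]) {
+ ///     i0 = idxBuffer[t0][d0][p0]; i1 = idxBuffer[t1][d1][p1];
+ ///     i = min(i0, i1); // or the universal index when needsUniv
+ ///     ... loop body ...
+ ///   } // the conditional pidx increments are emitted on exitCurrentLoop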
+ Operation *enterCoIterationOverTensorsAtDims(
+ OpBuilder &builder, Location loc, ArrayRef<size_t> tids,
+ ArrayRef<size_t> dims, bool needsUniv, MutableArrayRef<Value> reduc = {},
+ ArrayRef<size_t> extraTids = {}, ArrayRef<size_t> extraDims = {});
+
+ SmallVector<Value, 2> exitCurrentLoop(OpBuilder &builder, Location loc,
+ ArrayRef<Value> reduc = {});
+
+ /// Returns the array of coordinates for all the loops generated so far.
+ void getCoordinateArray(SmallVectorImpl<Value> &coords) const {
+ for (auto &l : loopStack)
+ coords.push_back(l.iv);
+ }
+
+ /// Gets loop induction variable at the given level.
+ Value getLoopIV(size_t level) const {
+ if (level < loopStack.size())
+ return loopStack[level].iv;
+ return nullptr;
+ }
+
+ ///
+ /// Getters.
+ ///
+ const std::vector<std::vector<Value>> &getPidxs() const { return pidxs; };
+ const std::vector<std::vector<Value>> &getCoord() const { return coord; };
+ const std::vector<std::vector<Value>> &getHighs() const { return highs; };
+ const std::vector<std::vector<Value>> &getPtrBuffer() const {
+ return ptrBuffer;
+ };
+ const std::vector<std::vector<Value>> &getIdxBuffer() const {
+ return idxBuffer;
+ };
+ const std::vector<Value> &getValBuffer() const { return valBuffer; };
+
+private:
+ struct LoopLevelInfo {
+ LoopLevelInfo(ArrayRef<size_t> tids, ArrayRef<size_t> dims, Operation *loop,
+ Value iv)
+ : tids(tids), dims(dims), loop(loop), iv(iv) {}
+ // TODO: maybe use a vector<pair> for tid and dim?
+ // The set of tensors that the loop is operating on
+ const llvm::SmallVector<size_t, 4> tids;
+ // The corresponding dims for the tensors
+ const llvm::SmallVector<size_t, 4> dims;
+ const Operation *loop; // the loop operation
+ const Value iv; // the induction variable for the loop
+ };
+
+ /// Linearizes address for dense dimension (i.e., p = (i * d0) + j).
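+ /// E.g., for a dense dim of size sz (taken from highs[tid][dim]) with parent
+ /// position p and induction variable iv, the emitted address is p * sz + iv.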
+ Value genAddress(OpBuilder &builder, Location loc, size_t tid, size_t dim,
+ Value iv) {
+ Value p = dim == 0 ? constantIndex(builder, loc, 0) : pidxs[tid][dim - 1];
+ Value mul = builder.create<arith::MulIOp>(loc, highs[tid][dim], p);
+ Value add = builder.create<arith::AddIOp>(loc, mul, iv);
+ return add;
+ }
+
+ bool isOutputTensor(size_t tid) {
+ return hasOutput && tid == tensors.size() - 1;
+ }
+
+ /// Sets up [lo, hi] for iterating tensor[dim]; it assumes that
+ /// tensor[0...dim-1] has already been set up.
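+ /// E.g., for a compressed dim, this sets (roughly):
+ ///   pidxs[tid][dim] = ptrBuffer[tid][dim][pLo];
+ ///   highs[tid][dim] = ptrBuffer[tid][dim][pLo + 1];
+ /// where pLo is the parent dim's pidx (or 0 for the first dim).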
+ void prepareLoopOverTensorAtDim(OpBuilder &builder, Location loc, size_t tid,
+ size_t dim);
+
+ /// Emits extra locals, since the locals might not be in the simplified
+ /// lattice points used to generate the loops, but are still required to
+ /// generate expressions.
+ void emitExtraLocalsForTensorsAtDenseDims(OpBuilder &builder, Location loc,
+ ArrayRef<size_t> tids,
+ ArrayRef<size_t> dims);
+
+ /// Exits a for loop, returns the reduction results, e.g.,
+ /// %ret = for () {
+ /// ...
+ /// yield %val
+ /// }
+ /// Returns %ret to the user, while %val is provided by the user (`reduc`).
+ SmallVector<Value, 2> exitForLoop(OpBuilder &builder, Location loc,
+ ArrayRef<Value> reduc);
+
+ /// Exits a while loop, returns the reduction results.
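+ /// On exit, each sparse pidx is advanced only if its coordinate matched the
+ /// loop's minimum index, i.e., (roughly)
+ ///   pidx = select(coord == iv, pidx + 1, pidx),
+ /// following the TACO-style induction at the end of the while body.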
+ SmallVector<Value, 2> exitCoiterationLoop(OpBuilder &builder, Location loc,
+ ArrayRef<Value> reduc);
+
+ // Whether the loop emitter needs to treat the last tensor as the output
+ // tensor.
+ bool hasOutput;
+ /// Input and (optional) output tensors.
+ std::vector<Value> tensors;
+ /// The dim type array for each tensor.
+ std::vector<std::vector<DimLevelType>> dimTypes;
+ /// Sparse iteration information (by tensor and dim). These arrays
+ /// are updated to remain current within the current loop.
+ std::vector<std::vector<Value>> pidxs;
+ std::vector<std::vector<Value>> coord;
+ std::vector<std::vector<Value>> highs;
+ std::vector<std::vector<Value>> ptrBuffer; // to_pointers
+ std::vector<std::vector<Value>> idxBuffer; // to_indices
+ std::vector<Value> valBuffer; // to_value
+
+ // Loop Stack, stores the information of all the nested loops that are alive.
+ std::vector<LoopLevelInfo> loopStack;
+
+ // Loop Sequence Stack, stores the universal index for the current loop
+ // sequence.
+ std::vector<Value> loopSeqStack;
+
+ // TODO: not yet used, it should track the current level for each tensor
+ // to help eliminate the `dim` parameters from the above APIs.
+ // std::vector<size_t> curLv;
+};
+
} // namespace sparse_tensor
} // namespace mlir
// 1. Generates loop for the sparse input.
SparseTensorLoopEmitter loopEmitter(ValueRange{input});
loopEmitter.initializeLoopEmit(rewriter, loc);
- for (int64_t i = 0; i < rank; i++)
+ for (int64_t i = 0; i < rank; i++) {
+ // TODO: provide a utility function for loop sequences that only contain
+ // one for loop?
+ loopEmitter.enterNewLoopSeq(rewriter, loc, 0, static_cast<size_t>(i));
loopEmitter.enterLoopOverTensorAtDim(rewriter, loc, 0, i);
+ }
SmallVector<Value, 4> coords;
coords.reserve(rank);
loopEmitter.getCoordinateArray(coords);
- Value vals = loopEmitter.getTensorValueBuffer(0);
- Value pidx = loopEmitter.getLastLevelTensorPointerIndex(0);
+ Value vals = loopEmitter.getValBuffer()[0];
+ Value pidx = loopEmitter.getPidxs()[0].back();
// Loads the value from sparse tensor using pointer index;
// loads the value from dense tensor using coordinate array.
Value val = enc ? rewriter.create<memref::LoadOp>(loc, vals, pidx)
: rewriter.create<memref::LoadOp>(loc, vals, coords);
- for (int64_t i = 0; i < rank; i++)
- loopEmitter.exitCurrentLoop();
-
// 2. Inline the block in the foreach operator.
Block::iterator inlinePos = rewriter.getInsertionPoint();
Block *srcBlock = op.getBody();
// Remove sparse_tensor.yield.
rewriter.eraseOp(srcBlock->getTerminator());
+ for (int64_t i = 0; i < rank; i++) {
+ loopEmitter.exitCurrentLoop(rewriter, loc);
+ loopEmitter.exitCurrentLoopSeq();
+ }
+
SmallVector<Value, 4> args;
// Remap coordinates.
for (int64_t i = 0; i < rank; i++) {
namespace {
+constexpr unsigned INVALID_ID = std::numeric_limits<unsigned>::max();
+
// Iteration graph sorting.
enum SortMask {
kSparseOnly = 0x0,
// Code generation.
struct CodeGen {
- CodeGen(SparsificationOptions o, unsigned numTensors, unsigned numLoops,
- OpOperand *op, unsigned nest, std::vector<unsigned> &ts)
- : options(o), loops(numLoops), sizes(numLoops), buffers(numTensors),
- pointers(numTensors, std::vector<Value>(numLoops)),
- indices(numTensors, std::vector<Value>(numLoops)),
- highs(numTensors, std::vector<Value>(numLoops)),
- pidxs(numTensors, std::vector<Value>(numLoops)),
- idxs(numTensors, std::vector<Value>(numLoops)), sparseOut(op),
- outerParNest(nest), topSort(ts) {}
+ CodeGen(SparsificationOptions o, ValueRange tensors, unsigned numTensors,
+ unsigned numLoops, OpOperand *op, unsigned nest,
+ std::vector<unsigned> &ts)
+ : options(o), loopEmitter(tensors, /*hasOutput=*/true,
+ /*isSparseOut=*/op != nullptr),
+ sparseOut(op), outerParNest(nest), topSort(ts) {}
/// Sparsification options.
SparsificationOptions options;
- /// Universal dense indices and upper bounds (by index). The loops array
- /// is updated with the value of the universal dense index in the current
- /// loop. The sizes array is set once with the inferred dimension sizes.
- std::vector<Value> loops;
- std::vector<Value> sizes;
- /// Buffers for storing dense and sparse numerical values (by tensor).
- /// This array is set once during bufferization of all tensors.
- std::vector<Value> buffers;
- /// Sparse storage schemes (1-D): pointers and indices (by tensor and index).
- /// This array is set once during bufferization of all sparse tensors.
- std::vector<std::vector<Value>> pointers;
- std::vector<std::vector<Value>> indices;
- /// Sparse iteration information (by tensor and index). These arrays
- /// are updated to remain current within the current loop.
- std::vector<std::vector<Value>> highs;
- std::vector<std::vector<Value>> pidxs;
- std::vector<std::vector<Value>> idxs;
+ /// Loop emitter helper class.
+ SparseTensorLoopEmitter loopEmitter;
/// Current reduction, updated during code generation. When indices of a
/// reduction are exhausted, all inner loops can use a scalarized reduction.
unsigned redExp = -1u;
Value expCount;
// Topsort (reference should remain in scope).
std::vector<unsigned> &topSort;
+
+ // From tensor id + loop id => dim id.
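+ // E.g., for an identity-mapped tensor A(i, j), loopIdxToDim[A][i] = 0 and
+ // loopIdxToDim[A][j] = 1; entries remain INVALID_ID for loops that do not
+ // index A directly (e.g., only through a compound affine expression).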
+ // TODO: This map should probably be maintained by Merger (it can be set up
+ // together with dimLvlType Map).
+ std::vector<std::vector<unsigned>> loopIdxToDim;
+
+ // Initialize the above two mapping.
+ void buildLoopIdxToDimMap(linalg::GenericOp op);
+
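+ // Returns the induction variable for loop index `loopIdx`; loops are
+ // emitted following the topological order in `topSort`, so the emitted
+ // level of `loopIdx` is its position in `topSort`.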
+ Value getLoopIdxValue(size_t loopIdx) const {
+ for (unsigned lv = 0; lv < topSort.size(); lv++)
+ if (topSort[lv] == loopIdx)
+ return loopEmitter.getLoopIV(lv);
+
+ llvm_unreachable("invalid loop index");
+ }
};
+void CodeGen::buildLoopIdxToDimMap(linalg::GenericOp op) {
+ size_t numLoops = op.getNumLoops();
+ size_t numTensors = op.getNumOperands();
+ loopIdxToDim.assign(numTensors, std::vector<unsigned>(numLoops, INVALID_ID));
+
+ for (OpOperand &t : op->getOpOperands()) {
+ auto map = op.getMatchingIndexingMap(&t);
+ auto enc = getSparseTensorEncoding(t.get().getType());
+ // Scan all dimensions of current tensor.
+ unsigned tid = t.getOperandNumber();
+ for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
+ auto a = map.getResult(toOrigDim(enc, d)).dyn_cast<AffineDimExpr>();
+ if (a) {
+ unsigned loopId = a.getPosition();
+ // Fills the mapping.
+ loopIdxToDim[tid][loopId] = d;
+ }
+ // Else it is a compound affine expression; do nothing (we are fine for
+ // now, as we only support compound affine expressions on non-annotated
+ // dense tensors).
+ }
+ }
+}
+
} // namespace
//===----------------------------------------------------------------------===//
/// same index is used more than once. Also rejects compound affine
/// expressions in sparse dimensions.
static bool findAffine(Merger &merger, unsigned tensor, AffineExpr a,
- DimLevelType dim) {
+ DimLevelType dim, bool setLvlFormat = true) {
switch (a.getKind()) {
case AffineExprKind::DimId: {
unsigned idx = a.cast<AffineDimExpr>().getPosition();
if (!isUndefDLT(merger.getDimLevelType(tensor, idx)))
return false; // used more than once
- merger.setDimLevelType(tensor, idx, dim);
+
+ if (setLvlFormat)
+ merger.setDimLevelType(tensor, idx, dim);
return true;
}
case AffineExprKind::Add:
if (!isDenseDLT(dim))
return false; // compound only in dense dim
auto binOp = a.cast<AffineBinaryOpExpr>();
- return findAffine(merger, tensor, binOp.getLHS(), dim) &&
- findAffine(merger, tensor, binOp.getRHS(), dim);
+ // For an affine expression like d0 + d1, we do not set the dim level
+ // format for either of the loop indices d0 and d1.
+ return findAffine(merger, tensor, binOp.getLHS(), dim, false) &&
+ findAffine(merger, tensor, binOp.getRHS(), dim, false);
}
case AffineExprKind::Constant:
return isDenseDLT(dim); // const only in dense dim
// Sparse compiler synthesis methods (statements and expressions).
//===----------------------------------------------------------------------===//
-/// Generates buffer for the output tensor. Note that all sparse kernels
-/// assume that when all elements are written to (viz. x(i) = y(i) * z(i)),
-/// the output buffer is already initialized to all zeroes and only nonzeroes
-/// values are computed and written out. For updates (viz. x(i) += y(i) * z(i)),
-/// only nonzeroes values are used for the updates and no assumption on the
-/// original contents of the output buffer is necessary.
-static Value genOutputBuffer(CodeGen &codegen, OpBuilder &builder,
- linalg::GenericOp op, MemRefType denseTp,
- ArrayRef<Value> args) {
- Location loc = op.getLoc();
- OpOperand *lhs = op.getOutputOperand(0);
- Value tensor = lhs->get();
- bool isInit = op.isInitTensor(lhs);
- // An output tensor can simply materialize from the buffer of the tensor that
- // appears in the outs() clause. For updates, this has the advantage that only
- // the nonzero value are involved in the computation, keeping the operation
- // O(nnz). In all other cases, we are forced to zero out the buffer to enforce
- // the assumption above, which may negatively impact running complexity
- // (viz. O(n^2 + nnz) vs. O(nnz) for matrices).
- // TODO: use better analysis to avoid zeroing out the buffer?
- Value init = builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
- if (!isInit) {
- Value zero = constantZero(builder, loc, denseTp.getElementType());
- builder.create<linalg::FillOp>(loc, ValueRange{zero}, ValueRange{init});
- }
- return init;
-}
-
/// Local bufferization of all dense and sparse data structures.
static void genBuffers(Merger &merger, CodeGen &codegen, OpBuilder &builder,
linalg::GenericOp op) {
Location loc = op.getLoc();
- assert(op->getNumOperands() == op.getNumInputs() + 1);
- // For every tensor, find lower and upper bound on dimensions, set the
- // same bounds on loop indices, and obtain dense or sparse buffer(s).
- auto dynShape = {ShapedType::kDynamicSize};
- SmallVector<Value, 4> args;
- for (OpOperand &t : op->getOpOperands()) {
- unsigned tensor = t.getOperandNumber();
- auto shape = op.getShape(&t);
- auto map = op.getMatchingIndexingMap(&t);
- auto enc = getSparseTensorEncoding(t.get().getType());
- // Scan all dimensions of current tensor.
- args.clear();
- for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
- AffineExpr a = map.getResult(toOrigDim(enc, d));
- if (a.getKind() != AffineExprKind::DimId)
- continue; // compound
- unsigned idx = a.cast<AffineDimExpr>().getPosition();
- // Handle the different storage schemes.
- if (isCompressedDLT(merger.getDimLevelType(tensor, idx))) {
- // Compressed dimension, fetch pointer and indices.
- auto ptrTp =
- MemRefType::get(dynShape, getPointerOverheadType(builder, enc));
- auto indTp =
- MemRefType::get(dynShape, getIndexOverheadType(builder, enc));
- auto dim = builder.getIndexAttr(d);
- codegen.pointers[tensor][idx] =
- builder.create<ToPointersOp>(loc, ptrTp, t.get(), dim);
- codegen.indices[tensor][idx] =
- builder.create<ToIndicesOp>(loc, indTp, t.get(), dim);
- } else if (isSingletonDLT(merger.getDimLevelType(tensor, idx))) {
- // Singleton dimension, fetch indices.
- auto indTp =
- MemRefType::get(dynShape, getIndexOverheadType(builder, enc));
- auto dim = builder.getIndexAttr(d);
- codegen.indices[tensor][idx] =
- builder.create<ToIndicesOp>(loc, indTp, t.get(), dim);
- } else {
- // Dense dimension, nothing to fetch.
- assert(isDenseDLT(merger.getDimLevelType(tensor, idx)));
- }
- // Find upper bound in current dimension.
- unsigned p = toOrigDim(enc, d);
- Value up = linalg::createOrFoldDimOp(builder, loc, t.get(), p);
- if (ShapedType::isDynamic(shape[p]))
- args.push_back(up);
- assert(codegen.highs[tensor][idx] == nullptr);
- codegen.sizes[idx] = codegen.highs[tensor][idx] = up;
- }
- // Perform the required bufferization. Dense inputs materialize
- // from the input tensors. Dense outputs need special handling.
- // Sparse inputs use sparse primitives to obtain the values.
- Type elementType = getElementTypeOrSelf(t.get().getType());
- if (!enc) {
- // Non-annotated dense tensors.
- auto denseTp = MemRefType::get(shape, elementType);
- if (tensor < op.getNumInputs())
- codegen.buffers[tensor] =
- builder.create<bufferization::ToMemrefOp>(loc, denseTp, t.get());
- else
- codegen.buffers[tensor] =
- genOutputBuffer(codegen, builder, op, denseTp, args);
- } else if (&t != codegen.sparseOut) {
- // Annotated sparse tensors (not involved in output).
- auto sparseTp = MemRefType::get(dynShape, elementType);
- codegen.buffers[tensor] =
- builder.create<ToValuesOp>(loc, sparseTp, t.get());
- }
- }
+ assert(op.getNumOperands() == op.getNumInputs() + 1);
+
+ codegen.loopEmitter.initializeLoopEmit(
+ builder, loc,
+ /// Generates buffer for the output tensor.
+ /// Note that all sparse kernels assume that when all elements are written
+ /// to (viz. x(i) = y(i) * z(i)), the output buffer is already initialized
+ /// to all zeroes and only nonzeroes values are computed and written out.
+ /// For updates (viz. x(i) += y(i) * z(i)), only nonzeroes values are used
+ /// for the updates and no assumption on the original contents of the
+ /// output buffer is necessary.
+ [&op](OpBuilder &builder, Location loc, Value memref,
+ Value tensor) -> Value {
+ // Must not be a sparse tensor.
+ assert(!getSparseTensorEncoding(tensor.getType()));
+ OpOperand *lhs = op.getOutputOperand(0);
+ // The two output tensor references should point to the same object.
+ assert(lhs->get() == tensor);
+ bool isInit = op.isInitTensor(lhs);
+ // An output tensor can simply materialize from the buffer of the tensor
+ // that appears in the outs() clause. For updates, this has the
+ // advantage that only the nonzero value are involved in the
+ // computation, keeping the operation O(nnz). In all other cases, we are
+ // forced to zero out the buffer to enforce the assumption above, which
+ // may negatively impact running complexity (viz. O(n^2 + nnz) vs.
+ // O(nnz) for matrices).
+ // TODO: use better analysis to avoid zeroing out the buffer?
+ Value init = memref;
+ if (!isInit) {
+ Value zero = constantZero(builder, loc,
+ getElementTypeOrSelf(tensor.getType()));
+ builder.create<linalg::FillOp>(loc, ValueRange{zero},
+ ValueRange{init});
+ }
+ return init;
+ });
}
/// Generates an affine expression.
switch (a.getKind()) {
case AffineExprKind::DimId: {
unsigned idx = a.cast<AffineDimExpr>().getPosition();
- return codegen.loops[idx]; // universal dense index
+ return codegen.getLoopIdxValue(idx); // universal dense index
}
case AffineExprKind::Add: {
auto binOp = a.cast<AffineBinaryOpExpr>();
AffineExpr a = map.getResult(toOrigDim(enc, map.getNumResults() - 1));
assert(a.getKind() == AffineExprKind::DimId);
unsigned idx = a.cast<AffineDimExpr>().getPosition();
- return codegen.loops[idx];
+ return codegen.getLoopIdxValue(idx);
}
/// Generates subscript for load/store on a dense or sparse tensor.
if (enc) {
// Note that currently, all sparse subscripts are simple.
// TODO: accept affine too?
- AffineExpr a = map.getResult(toOrigDim(enc, rank - 1));
- assert(a.getKind() == AffineExprKind::DimId);
- unsigned idx = a.cast<AffineDimExpr>().getPosition();
- assert(codegen.pidxs[tensor][idx] != nullptr);
- args.push_back(codegen.pidxs[tensor][idx]); // position index
+ assert(map.getResult(toOrigDim(enc, rank - 1)).getKind() ==
+ AffineExprKind::DimId);
+ Value pidx = codegen.loopEmitter.getPidxs()[tensor].back();
+ assert(pidx);
+ args.push_back(pidx); // position index
} else {
for (unsigned d = 0; d < rank; d++) {
AffineExpr a = map.getResult(d);
args.push_back(genAffine(codegen, builder, a, op.getLoc()));
}
}
- return codegen.buffers[tensor];
+ return codegen.loopEmitter.getValBuffer()[tensor];
}
/// Generates insertion code to implement dynamic tensor load.
unsigned rank = op.getRank(t);
SmallVector<Value, 4> indices;
for (unsigned i = 0; i < rank; i++) {
- assert(codegen.loops[codegen.topSort[i]]);
- indices.push_back(codegen.loops[codegen.topSort[i]]);
+ assert(codegen.loopEmitter.getLoopIV(i));
+ indices.push_back(codegen.loopEmitter.getLoopIV(i));
}
builder.create<InsertOp>(loc, rhs, t->get(), indices);
return;
builder.create<memref::StoreOp>(loc, rhs, ptr, args);
}
-/// Generates a pointer/index load from the sparse storage scheme. Narrower
-/// data types need to be zero extended before casting the value into the
-/// index type used for looping and indexing.
-static Value genLoad(CodeGen &codegen, OpBuilder &builder, Location loc,
- Value ptr, Value s) {
- // Simply zero extends narrower indices into 64-bit values before casting to
- // index without a performance penalty.
- Value load = builder.create<memref::LoadOp>(loc, ptr, s);
- if (!load.getType().isa<IndexType>()) {
- if (load.getType().getIntOrFloatBitWidth() < 64)
- load = builder.create<arith::ExtUIOp>(loc, builder.getI64Type(), load);
- load =
- builder.create<arith::IndexCastOp>(loc, builder.getIndexType(), load);
- }
- return load;
-}
-
/// Generates an invariant value.
-static Value genInvariantValue(Merger &merger, CodeGen &codegen,
- OpBuilder &builder, unsigned exp) {
- Value val = merger.exp(exp).val;
- return val;
-}
-
-/// Generates an address computation "sz * p + i".
-static Value genAddress(CodeGen &codegen, OpBuilder &builder, Location loc,
- Value size, Value p, Value i) {
- Value mul = builder.create<arith::MulIOp>(loc, size, p);
- return builder.create<arith::AddIOp>(loc, mul, i);
+inline static Value genInvariantValue(Merger &merger, CodeGen &codegen,
+ OpBuilder &builder, unsigned exp) {
+ return merger.exp(exp).val;
}
/// Generates an index value.
-static Value genIndexValue(CodeGen &codegen, OpBuilder &builder, unsigned idx,
- unsigned ldx) {
- Value ival = codegen.loops[idx];
- return ival;
+inline static Value genIndexValue(CodeGen &codegen, OpBuilder &builder,
+ unsigned idx) {
+ return codegen.getLoopIdxValue(idx);
}
/// Semi-ring branches are simply inlined by the sparse compiler. Prior
Block *block, Value e, unsigned ldx) {
if (Operation *def = e.getDefiningOp()) {
if (auto indexOp = dyn_cast<linalg::IndexOp>(def))
- return genIndexValue(codegen, rewriter, indexOp.getDim(), ldx);
+ return genIndexValue(codegen, rewriter, indexOp.getDim());
if (def->getBlock() == block) {
for (unsigned i = 0, n = def->getNumOperands(); i < n; i++)
def->setOperand(
if (merger.exp(exp).kind == Kind::kInvariant)
return genInvariantValue(merger, codegen, rewriter, exp);
if (merger.exp(exp).kind == Kind::kIndex)
- return genIndexValue(codegen, rewriter, merger.exp(exp).index, ldx);
+ return genIndexValue(codegen, rewriter, merger.exp(exp).index);
if (merger.exp(exp).kind == Kind::kReduce) {
// Make custom reduction identity accessible for expanded access pattern.
unsigned idx = a.cast<AffineDimExpr>().getPosition();
if (idx == ldx)
atLevel = true;
- return codegen.loops[idx] != nullptr; // no longer in play?
+ return codegen.getLoopIdxValue(idx) != nullptr; // no longer in play?
}
case AffineExprKind::Add:
case AffineExprKind::Mul: {
if (!lhs || codegen.outerParNest != op.getRank(lhs) - 1 ||
at != codegen.outerParNest)
return; // not needed at this level
+ assert(codegen.redVal == nullptr);
// Generate start or end of an expanded access pattern.
Value tensor = lhs->get();
Location loc = op.getLoc();
assert(codegen.expValues);
SmallVector<Value, 4> indices;
for (unsigned i = 0; i < at; i++) {
- assert(codegen.loops[codegen.topSort[i]]);
- indices.push_back(codegen.loops[codegen.topSort[i]]);
+ assert(codegen.loopEmitter.getLoopIV(i));
+ indices.push_back(codegen.loopEmitter.getLoopIV(i));
}
builder.create<CompressOp>(loc, codegen.expValues, codegen.expFilled,
codegen.expAdded, codegen.expCount, tensor,
}
}
-/// Generates initialization code for the subsequent loop sequence at
-/// current index level. Returns true if the loop sequence needs to
-/// maintain the universal index.
-static bool genInit(Merger &merger, CodeGen &codegen, OpBuilder &builder,
- linalg::GenericOp op, unsigned at, BitVector &inits) {
- std::vector<unsigned> &topSort(codegen.topSort);
- bool needsUniv = false;
- Location loc = op.getLoc();
- unsigned idx = topSort[at];
-
- // Initialize sparse positions.
- for (unsigned b = 0, be = inits.size(); b < be; b++) {
- if (!inits[b])
- continue;
- unsigned tensor = merger.tensor(b);
- assert(idx == merger.index(b));
- if (isCompressedDLT(merger.getDimLevelType(b))) {
- // Initialize sparse index that will implement the iteration:
- // for pidx_idx = pointers(pidx_idx-1), pointers(1+pidx_idx-1)
- unsigned pat = at;
- for (; pat != 0; pat--) {
- if (codegen.pidxs[tensor][topSort[pat - 1]])
- break;
- }
- Value ptr = codegen.pointers[tensor][idx];
- Value one = constantIndex(builder, loc, 1);
- Value p0 = (pat == 0) ? constantIndex(builder, loc, 0)
- : codegen.pidxs[tensor][topSort[pat - 1]];
- codegen.pidxs[tensor][idx] = genLoad(codegen, builder, loc, ptr, p0);
- Value p1 = builder.create<arith::AddIOp>(loc, p0, one);
- codegen.highs[tensor][idx] = genLoad(codegen, builder, loc, ptr, p1);
- } else if (isSingletonDLT(merger.getDimLevelType(b))) {
- // Initialize sparse index that will implement the "iteration":
- // for pidx_idx = pidx_idx-1, 1+pidx_idx-1
- // We rely on subsequent loop unrolling to get rid of the loop
- // if it is not involved in co-iteration with anything else.
- unsigned pat = at;
- for (; pat != 0; pat--) {
- if (codegen.pidxs[tensor][topSort[pat - 1]])
- break;
- }
- Value one = constantIndex(builder, loc, 1);
- Value p0 = (pat == 0) ? constantIndex(builder, loc, 0)
- : codegen.pidxs[tensor][topSort[pat - 1]];
- codegen.pidxs[tensor][idx] = p0;
- codegen.highs[tensor][idx] = builder.create<arith::AddIOp>(loc, p0, one);
- } else {
- assert(isDenseDLT(merger.getDimLevelType(b)) ||
- isUndefDLT(merger.getDimLevelType(b)));
- // Dense index still in play.
- needsUniv = true;
- }
- }
-
- // Initialize the universal dense index.
- codegen.loops[idx] = constantIndex(builder, loc, 0);
- return needsUniv;
-}
-
-/// Returns parallelization strategy. Any implicit loop in the Linalg operation
-/// that is marked "parallel" is a candidate. Whether it is actually converted
-/// to a parallel operation depends on the requested strategy.
+/// Returns parallelization strategy. Any implicit loop in the Linalg
+/// operation that is marked "parallel" is a candidate. Whether it is actually
+/// converted to a parallel operation depends on the requested strategy.
static bool isParallelFor(CodeGen &codegen, bool isOuter, bool isReduction,
bool isSparse) {
// Reject parallelization of sparse output.
/// Generates a for-loop on a single index.
static Operation *genFor(Merger &merger, CodeGen &codegen, OpBuilder &builder,
linalg::GenericOp op, bool isOuter, bool isInner,
- unsigned idx, BitVector &indices) {
- unsigned fb = indices.find_first();
- unsigned tensor = merger.tensor(fb);
- assert(idx == merger.index(fb));
+ unsigned idx, size_t tid, size_t dim,
+ ArrayRef<size_t> extraTids,
+ ArrayRef<size_t> extraDims) {
+ Location loc = op.getLoc();
auto iteratorTypes = op.getIteratorTypesArray();
bool isReduction = linalg::isReductionIterator(iteratorTypes[idx]);
- bool isSparse = isCompressedDLT(merger.getDimLevelType(fb)) ||
- isSingletonDLT(merger.getDimLevelType(fb));
+ bool isSparse = isCompressedDLT(merger.getDimLevelType(tid, idx)) ||
+ isSingletonDLT(merger.getDimLevelType(tid, idx));
bool isParallel = isParallelFor(codegen, isOuter, isReduction, isSparse);
-
- // Loop bounds and increment.
- Location loc = op.getLoc();
- Value lo = isSparse ? codegen.pidxs[tensor][idx] : codegen.loops[idx];
- Value hi = isSparse ? codegen.highs[tensor][idx] : codegen.sizes[idx];
- Value step = constantIndex(builder, loc, 1);
-
- // Emit a parallel loop.
- if (isParallel) {
- scf::ParallelOp parOp = builder.create<scf::ParallelOp>(loc, lo, hi, step);
- if (isSparse)
- codegen.pidxs[tensor][idx] = parOp.getInductionVars()[0];
- else
- codegen.loops[idx] = parOp.getInductionVars()[0];
- builder.setInsertionPointToStart(parOp.getBody());
- return parOp;
- }
+ assert(!isParallel);
// Emit a sequential or vector loop.
SmallVector<Value, 4> operands;
if (codegen.expValues)
operands.push_back(codegen.expCount);
- scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, operands);
+ Operation *loop = codegen.loopEmitter.enterLoopOverTensorAtDim(
+ builder, loc, tid, dim, operands, isParallel, extraTids, extraDims);
+ // The operands should have been updated by the loop emitter already.
if (codegen.redVal)
- updateReduc(merger, codegen, forOp.getRegionIterArgs().front());
+ updateReduc(merger, codegen, operands.front());
if (codegen.expValues)
- codegen.expCount = forOp.getRegionIterArgs().back();
- // Assign induction variable to sparse or dense index.
- Value iv = forOp.getInductionVar();
- if (isSparse)
- codegen.pidxs[tensor][idx] = iv;
- else
- codegen.loops[idx] = iv;
-
- builder.setInsertionPointToStart(forOp.getBody());
- return forOp;
+ codegen.expCount = operands.back();
+
+ return loop;
}
/// Emit a while-loop for co-iteration over multiple indices.
static Operation *genWhile(Merger &merger, CodeGen &codegen, OpBuilder &builder,
linalg::GenericOp op, unsigned idx, bool needsUniv,
- BitVector &indices) {
- SmallVector<Type, 4> types;
+ ArrayRef<size_t> condTids, ArrayRef<size_t> condDims,
+ ArrayRef<size_t> extraTids,
+ ArrayRef<size_t> extraDims) {
SmallVector<Value, 4> operands;
+
// Construct the while-loop with a parameter for each index.
- Type indexType = builder.getIndexType();
- for (unsigned b = 0, be = indices.size(); b < be; b++) {
- if (!indices[b])
- continue;
- if (isCompressedDLT(merger.getDimLevelType(b)) ||
- isSingletonDLT(merger.getDimLevelType(b))) {
- unsigned tensor = merger.tensor(b);
- assert(idx == merger.index(b));
- types.push_back(indexType);
- operands.push_back(codegen.pidxs[tensor][idx]);
- } else {
- assert(isDenseDLT(merger.getDimLevelType(b)) ||
- isUndefDLT(merger.getDimLevelType(b)));
- }
- }
- if (codegen.redVal) {
- types.push_back(codegen.redVal.getType());
+ if (codegen.redVal)
operands.push_back(codegen.redVal);
- }
- if (codegen.expValues) {
- types.push_back(indexType);
+ if (codegen.expValues)
operands.push_back(codegen.expCount);
- }
- if (needsUniv) {
- types.push_back(indexType);
- operands.push_back(codegen.loops[idx]);
- }
- assert(types.size() == operands.size());
- Location loc = op.getLoc();
- scf::WhileOp whileOp = builder.create<scf::WhileOp>(loc, types, operands);
- SmallVector<Location> locs(types.size(), loc);
- Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, locs);
- Block *after = builder.createBlock(&whileOp.getAfter(), {}, types, locs);
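+  // The loop emitter constructs the co-iterating while-loop over all
+  // conditional (tid, dim) pairs, threading the collected operands through
+  // its regions; the extra tids/dims only need their locals set up.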
+ Operation *loop = codegen.loopEmitter.enterCoIterationOverTensorsAtDims(
+ builder, op.getLoc(), condTids, condDims, needsUniv, operands, extraTids,
+ extraDims);
- // Build the "before" region, which effectively consists
- // of a conjunction of "i < upper" tests on all induction.
- builder.setInsertionPointToStart(&whileOp.getBefore().front());
- Value cond;
- unsigned o = 0;
- for (unsigned b = 0, be = indices.size(); b < be; b++) {
- if (!indices[b])
- continue;
- if (isCompressedDLT(merger.getDimLevelType(b)) ||
- isSingletonDLT(merger.getDimLevelType(b))) {
- unsigned tensor = merger.tensor(b);
- assert(idx == merger.index(b));
- Value op1 = before->getArgument(o);
- Value op2 = codegen.highs[tensor][idx];
- Value opc = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::ult,
- op1, op2);
- cond = cond ? builder.create<arith::AndIOp>(loc, cond, opc) : opc;
- codegen.pidxs[tensor][idx] = after->getArgument(o++);
- } else {
- assert(isDenseDLT(merger.getDimLevelType(b)) ||
- isUndefDLT(merger.getDimLevelType(b)));
- }
- }
if (codegen.redVal)
- updateReduc(merger, codegen, after->getArgument(o++));
+ updateReduc(merger, codegen, operands.front());
if (codegen.expValues)
- codegen.expCount = after->getArgument(o++);
- if (needsUniv)
- codegen.loops[idx] = after->getArgument(o++);
- assert(o == operands.size());
- builder.create<scf::ConditionOp>(loc, cond, before->getArguments());
- builder.setInsertionPointToStart(&whileOp.getAfter().front());
- return whileOp;
+ codegen.expCount = operands.back();
+
+ return loop;
}
/// Generates a for-loop or a while-loop, depending on whether it implements
/// singleton iteration or co-iteration over the given conjunction.
static Operation *genLoop(Merger &merger, CodeGen &codegen, OpBuilder &builder,
linalg::GenericOp op, unsigned at, bool needsUniv,
- BitVector &indices) {
+ ArrayRef<size_t> condTids, ArrayRef<size_t> condDims,
+ ArrayRef<size_t> extraTids,
+ ArrayRef<size_t> extraDims) {
+ assert(condTids.size() == condDims.size());
+ assert(extraTids.size() == extraDims.size());
unsigned idx = codegen.topSort[at];
- if (indices.count() == 1) {
+ if (condTids.size() == 1) {
bool isOuter = at == 0;
bool isInner = at == codegen.topSort.size() - 1;
- return genFor(merger, codegen, builder, op, isOuter, isInner, idx, indices);
- }
- return genWhile(merger, codegen, builder, op, idx, needsUniv, indices);
-}
-
-/// Generates the local variables for this loop, consisting of the sparse
-/// indices, restored universal dense index, and dense positions.
-static void genLocals(Merger &merger, CodeGen &codegen, OpBuilder &builder,
- linalg::GenericOp op, unsigned at, bool needsUniv,
- BitVector &locals) {
- std::vector<unsigned> &topSort(codegen.topSort);
- Location loc = op.getLoc();
- unsigned idx = topSort[at];
-
- // Initialize sparse indices.
- Value min;
- for (unsigned b = 0, be = locals.size(); b < be; b++) {
- if (!locals[b])
- continue;
- if (isCompressedDLT(merger.getDimLevelType(b)) ||
- isSingletonDLT(merger.getDimLevelType(b))) {
- unsigned tensor = merger.tensor(b);
- assert(idx == merger.index(b));
- Value ptr = codegen.indices[tensor][idx];
- Value s = codegen.pidxs[tensor][idx];
- Value load = genLoad(codegen, builder, loc, ptr, s);
- codegen.idxs[tensor][idx] = load;
- if (!needsUniv) {
- if (min) {
- Value cmp = builder.create<arith::CmpIOp>(
- loc, arith::CmpIPredicate::ult, load, min);
- min = builder.create<arith::SelectOp>(loc, cmp, load, min);
- } else {
- min = load;
- }
- }
- } else {
- assert(isDenseDLT(merger.getDimLevelType(b)) ||
- isUndefDLT(merger.getDimLevelType(b)));
- }
- }
-
- // Merge dense universal index over minimum.
- if (min) {
- assert(!needsUniv);
- codegen.loops[idx] = min;
- }
-
- // Initialize dense positions. Note that we generate dense indices of the
- // output tensor unconditionally, since they may not appear in the lattice,
- // but may be needed for linearized codegen.
- for (unsigned b = 0, be = locals.size(); b < be; b++) {
- if ((locals[b] || merger.isOutTensor(b, idx)) &&
- isDenseDLT(merger.getDimLevelType(b))) {
- unsigned tensor = merger.tensor(b);
- assert(idx == merger.index(b));
- unsigned pat = at;
- for (; pat != 0; pat--)
- if (codegen.pidxs[tensor][topSort[pat - 1]])
- break;
- Value p = (pat == 0) ? constantIndex(builder, loc, 0)
- : codegen.pidxs[tensor][topSort[pat - 1]];
- codegen.pidxs[tensor][idx] = genAddress(
- codegen, builder, loc, codegen.sizes[idx], p, codegen.loops[idx]);
- }
+ return genFor(merger, codegen, builder, op, isOuter, isInner, idx,
+ condTids.front(), condDims.front(), extraTids, extraDims);
}
+ return genWhile(merger, codegen, builder, op, idx, needsUniv, condTids,
+ condDims, extraTids, extraDims);
}
/// Generates the induction structure for a while-loop.
-static void genWhileInduction(Merger &merger, CodeGen &codegen,
- OpBuilder &builder, linalg::GenericOp op,
- unsigned idx, bool needsUniv,
- BitVector &induction, scf::WhileOp whileOp) {
+static void finalizeWhileOp(Merger &merger, CodeGen &codegen,
+ OpBuilder &builder, linalg::GenericOp op,
+ unsigned idx, bool needsUniv, BitVector &induction,
+ scf::WhileOp whileOp) {
Location loc = op.getLoc();
// Finalize each else branch of all if statements.
if (codegen.redVal || codegen.expValues) {
}
}
builder.setInsertionPointToEnd(&whileOp.getAfter().front());
- // Finalize the induction. Note that the induction could be performed
- // in the individual if-branches to avoid re-evaluating the conditions.
- // However, that would result in a rather elaborate forest of yield
- // instructions during code generation. Moreover, performing the induction
- // after the if-statements more closely resembles code generated by TACO.
- unsigned o = 0;
- SmallVector<Value, 4> operands;
- Value one = constantIndex(builder, loc, 1);
- for (unsigned b = 0, be = induction.size(); b < be; b++) {
- if (!induction[b])
- continue;
- if (isCompressedDLT(merger.getDimLevelType(b)) ||
- isSingletonDLT(merger.getDimLevelType(b))) {
- unsigned tensor = merger.tensor(b);
- assert(idx == merger.index(b));
- Value op1 = codegen.idxs[tensor][idx];
- Value op2 = codegen.loops[idx];
- Value op3 = codegen.pidxs[tensor][idx];
- Value cmp = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
- op1, op2);
- Value add = builder.create<arith::AddIOp>(loc, op3, one);
- operands.push_back(builder.create<arith::SelectOp>(loc, cmp, add, op3));
- codegen.pidxs[tensor][idx] = whileOp->getResult(o++);
- } else {
- assert(isDenseDLT(merger.getDimLevelType(b)) ||
- isUndefDLT(merger.getDimLevelType(b)));
- }
- }
- if (codegen.redVal) {
- operands.push_back(codegen.redVal);
- updateReduc(merger, codegen, whileOp->getResult(o++));
- }
- if (codegen.expValues) {
- operands.push_back(codegen.expCount);
- codegen.expCount = whileOp->getResult(o++);
- }
- if (needsUniv) {
- operands.push_back(
- builder.create<arith::AddIOp>(loc, codegen.loops[idx], one));
- codegen.loops[idx] = whileOp->getResult(o++);
- }
- assert(o == operands.size());
- builder.create<scf::YieldOp>(loc, operands);
- builder.setInsertionPointAfter(whileOp);
-}
-
-/// Generates the induction structure for a for-loop.
-static void genForInduction(Merger &merger, CodeGen &codegen,
- OpBuilder &builder, linalg::GenericOp op,
- Operation *loop) {
- Location loc = op.getLoc();
- unsigned o = 0;
- SmallVector<Value, 4> operands;
- if (codegen.redVal) {
- operands.push_back(codegen.redVal);
- updateReduc(merger, codegen, loop->getResult(o++));
- }
- if (codegen.expValues) {
- operands.push_back(codegen.expCount);
- codegen.expCount = loop->getResult(o++);
- }
- assert(o == operands.size());
- if (o > 0)
- builder.create<scf::YieldOp>(loc, operands);
- builder.setInsertionPointAfter(loop);
}
/// Generates a single if-statement within a while-loop.
Value clause;
if (isCompressedDLT(merger.getDimLevelType(b)) ||
isSingletonDLT(merger.getDimLevelType(b))) {
- Value op1 = codegen.idxs[tensor][idx];
- Value op2 = codegen.loops[idx];
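+    // Map the loop index to the tensor dimension, then compare the coordinate
+    // from the loop emitter against the current loop index value.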
+ auto dim = codegen.loopIdxToDim[tensor][idx];
+ assert(dim != INVALID_ID);
+ Value op1 = codegen.loopEmitter.getCoord()[tensor][dim];
+ Value op2 = codegen.getLoopIdxValue(idx);
clause = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq, op1,
op2);
} else {
static bool startLoopSeq(Merger &merger, CodeGen &codegen, OpBuilder &builder,
linalg::GenericOp op, unsigned exp, unsigned at,
unsigned idx, unsigned ldx, unsigned lts) {
- assert(!codegen.loops[idx]);
+ assert(!codegen.getLoopIdxValue(idx));
// Emit invariants at this loop sequence level.
genInvariants(merger, codegen, builder, op, exp, ldx, /*atStart=*/true);
// Emit access pattern expansion for sparse tensor output.
genExpansion(merger, codegen, builder, op, at, /*atStart=*/true);
   // Emit further initialization at this loop sequence level.
unsigned l0 = merger.set(lts)[0];
- bool needsUniv =
- genInit(merger, codegen, builder, op, at, merger.lat(l0).bits);
+ bool needsUniv = false;
+
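+  // Collect the (tensor, dim) pairs for all sparse dimensions at this loop
+  // level; dense and undefined dimensions only require the universal index.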
+ SmallVector<size_t, 4> ts;
+ SmallVector<size_t, 4> ds;
+ for (auto b : merger.lat(l0).bits.set_bits()) {
+ if (isDenseDLT(merger.getDimLevelType(b)) ||
+ isUndefDLT(merger.getDimLevelType(b))) {
+ needsUniv = true;
+ } else {
+ unsigned tensor = merger.tensor(b);
+ assert(idx == merger.index(b));
+ size_t dim = codegen.loopIdxToDim[tensor][idx];
+ assert(dim != INVALID_ID);
+ ts.push_back(tensor);
+ ds.push_back(dim);
+ }
+ }
+
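+  // Let the loop emitter start a new loop sequence for the collected tensors
+  // and dims (this replaces the initialization previously done in genInit).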
+ codegen.loopEmitter.enterNewLoopSeq(builder, op.getLoc(), ts, ds);
+
// Maintain the universal index only if it is actually
// consumed by a subsequent lattice point.
if (needsUniv) {
return false;
}
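+/// Translates the bit vector of lattice point `li` into (tensor id, dim)
+/// pairs: `condTids`/`condDims` drive the loop condition, while
+/// `extraTids`/`extraDims` contain dense dimensions that are not part of the
+/// condition but still need their locals generated.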
+static void translateBitsToTidDimPairs(Merger &merger, CodeGen &codegen,
+ unsigned li, unsigned idx,
+ SmallVectorImpl<size_t> &condTids,
+ SmallVectorImpl<size_t> &condDims,
+ SmallVectorImpl<size_t> &extraTids,
+ SmallVectorImpl<size_t> &extraDims) {
+ const BitVector &simple = merger.lat(li).simple;
+ const BitVector &all = merger.lat(li).bits;
+ assert(simple.size() == all.size());
+  // First, convert the bits into (tensor id, dim) pairs.
+ for (unsigned b = 0, e = simple.size(); b < e; b++) {
+ size_t tid = merger.tensor(b);
+ if (simple.test(b)) {
+      // The simplified condition must be a subset of the original condition.
+ assert(all.test(b));
+ assert(merger.index(b) == idx);
+ if (isUndefDLT(merger.getDimLevelType(b))) {
+        // This could be a synthetic tensor (used for invariants or for the
+        // sparse output tensor). In both cases, we mean to generate loops
+        // over the output tensor, e.g., out[i][j] = invariant;
+ if (merger.getSynTensorID() == tid)
+ tid = merger.getOutTensorID();
+ }
+ auto dim = codegen.loopIdxToDim[tid][idx];
+ if (dim != INVALID_ID) {
+        // The dim could be invalid if this is a zero-ranked tensor.
+ condTids.push_back(tid);
+ condDims.push_back(dim);
+ }
+ } else if ((all.test(b) || merger.isOutTensor(b, idx)) &&
+ isDenseDLT(merger.getDimLevelType(b))) {
+ assert(merger.index(b) == idx);
+ // Note that we generate dense indices of the output tensor
+ // unconditionally, since they may not appear in the lattice, but may be
+ // needed for linearized codegen.
+      // Only dense dimensions should have been optimized out of the loop
+      // condition.
+ assert(isDenseDLT(merger.getDimLevelType(b)));
+ auto dim = codegen.loopIdxToDim[tid][idx];
+ assert(dim != INVALID_ID);
+ extraTids.push_back(tid);
+ extraDims.push_back(dim);
+ }
+ }
+}
+
/// Starts a single loop in current sequence.
static Operation *startLoop(Merger &merger, CodeGen &codegen,
OpBuilder &builder, linalg::GenericOp op,
unsigned at, unsigned li, bool needsUniv) {
+ // The set of tensors + dims to generate loops on
+ SmallVector<size_t, 4> condTids, condDims;
+  // The set of (dense) tensors that are optimized out of the loop condition,
+  // yet still need extra locals to iterate over them.
+ SmallVector<size_t, 4> extraTids, extraDims;
+
+ translateBitsToTidDimPairs(merger, codegen, li, codegen.topSort[at], condTids,
+ condDims, extraTids, extraDims);
// Emit the for/while-loop control.
Operation *loop = genLoop(merger, codegen, builder, op, at, needsUniv,
- merger.lat(li).simple);
- // Emit the locals for this loop.
- genLocals(merger, codegen, builder, op, at, needsUniv, merger.lat(li).bits);
+ condTids, condDims, extraTids, extraDims);
return loop;
}
unsigned li, bool needsUniv) {
// End a while-loop.
if (auto whileOp = dyn_cast<scf::WhileOp>(loop)) {
- genWhileInduction(merger, codegen, builder, op, idx, needsUniv,
- merger.lat(li).bits, whileOp);
- return needsUniv;
+ finalizeWhileOp(merger, codegen, builder, op, idx, needsUniv,
+ merger.lat(li).bits, whileOp);
+ } else {
+ needsUniv = false;
}
- // End a for-loop.
- genForInduction(merger, codegen, builder, op, loop);
- return false;
+
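+  // Hand the loop-carried values (reduction value and expansion count) to the
+  // loop emitter, which finalizes the loop being exited and returns the
+  // updated values.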
+ SmallVector<Value, 2> reduc;
+ if (codegen.redVal)
+ reduc.push_back(codegen.redVal);
+ if (codegen.expValues)
+ reduc.push_back(codegen.expCount);
+
+ auto loopRet =
+ codegen.loopEmitter.exitCurrentLoop(builder, op.getLoc(), reduc);
+ assert(reduc.size() == loopRet.size());
+
+ if (codegen.redVal)
+ updateReduc(merger, codegen, loopRet.front());
+ if (codegen.expValues)
+ codegen.expCount = loopRet.back();
+
+ return needsUniv;
}
/// Ends a loop sequence at given level.
static void endLoopSeq(Merger &merger, CodeGen &codegen, OpBuilder &builder,
linalg::GenericOp op, unsigned exp, unsigned at,
unsigned idx, unsigned ldx) {
- assert(codegen.loops[idx]);
- codegen.loops[idx] = Value();
+ assert(codegen.getLoopIdxValue(idx) == nullptr);
+ codegen.loopEmitter.exitCurrentLoopSeq();
// Unmark bookkeeping of invariants and loop index.
genInvariants(merger, codegen, builder, op, exp, ldx, /*atStart=*/false);
// Finalize access pattern expansion for sparse tensor output.
} else {
     // To rematerialize a non-annotated tensor, simply load it
// from the bufferized value.
- Value val = codegen.buffers.back(); // value array
+ Value val = codegen.loopEmitter.getValBuffer().back(); // value array
rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, resType, val);
}
}
// Inadmissible expression, reject.
return failure();
- // Recursively generates code if admissible.
merger.setHasSparseOut(sparseOut != nullptr);
- CodeGen codegen(options, numTensors, numLoops, sparseOut, outerParNest,
- topSort);
+
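+  // Collect all tensor operands of the operation; these are passed to the
+  // CodeGen environment and its loop emitter.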
+ SmallVector<Value, 4> tensors;
+ for (OpOperand &t : op->getOpOperands())
+ tensors.push_back(t.get());
+
+ // Recursively generates code if admissible.
+ CodeGen codegen(options, tensors, numTensors, numLoops, sparseOut,
+ outerParNest, topSort);
+  // TODO: maybe the merger should be responsible for maintaining the map.
+ codegen.buildLoopIdxToDimMap(op);
genBuffers(merger, codegen, rewriter, op);
genStmt(merger, codegen, rewriter, op, exp, 0);
genResult(merger, codegen, rewriter, op);
// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 1 : index} : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton" ] }>> to memref<?xf64>
-// CHECK: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<64xf64>
-// CHECK: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64>
-// CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
-// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<64xf64>
+// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64>
+// CHECK-DAG: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_13:.*]] = %[[VAL_11]] to %[[VAL_12]] step %[[VAL_4]] {
// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_13]]] : memref<?xindex>
// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_14]]] : memref<32xf64>
// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 1 : index} : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x64xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed-nu", "singleton" ] }>> to memref<?xf64>
-// CHECK: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x64xf64>
-// CHECK: linalg.fill ins(%[[VAL_3]] : f64) outs(%[[VAL_14]] : memref<32x64xf64>)
-// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
-// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
-// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_4]]] : memref<?xindex>
-// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_5]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x64xf64>
+// CHECK-DAG: linalg.fill ins(%[[VAL_3]] : f64) outs(%[[VAL_14]] : memref<32x64xf64>)
+// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_18:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK: %[[VAL_19:.*]]:2 = scf.while (%[[VAL_20:.*]] = %[[VAL_15]], %[[VAL_21:.*]] = %[[VAL_17]]) : (index, index) -> (index, index) {
// CHECK: %[[VAL_22:.*]] = arith.cmpi ult, %[[VAL_20]], %[[VAL_16]] : index
// CHECK: %[[VAL_23:.*]] = arith.cmpi ult, %[[VAL_21]], %[[VAL_18]] : index
// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf32>
+// CHECK-DAG: %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]]
-// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_11]] : memref<32xf32>)
-// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref<?xindex>
-// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref<?xindex>
+// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_11]] : memref<32xf32>)
// CHECK: %[[VAL_14:.*]]:2 = scf.while (%[[VAL_15:.*]] = %[[VAL_12]], %[[VAL_16:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) {
// CHECK: %[[VAL_17:.*]] = arith.cmpi ult, %[[VAL_15]], %[[VAL_13]] : index
// CHECK: scf.condition(%[[VAL_17]]) %[[VAL_15]], %[[VAL_16]] : index, index
// CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]]
-// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_8]] : memref<32xf32>)
-// CHECK: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref<?xindex>
-// CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_10:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref<?xindex>
+// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_8]] : memref<32xf32>)
// CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_9]] to %[[VAL_10]] step %[[VAL_3]] {
// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_11]]] : memref<?xindex>
// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_11]]] : memref<?xf32>
// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]]
-// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_9]] : memref<32xf32>)
-// CHECK: %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
-// CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_9]] : memref<32xf32>)
+// CHECK-DAG: %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_10]] to %[[VAL_11]] step %[[VAL_4]] {
// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref<?xindex>
// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_12]]] : memref<?xf32>
// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]]
-// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>)
-// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref<?xindex>
-// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_6]]] : memref<?xindex>
+// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>)
+// CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_6]]] : memref<?xindex>
// CHECK: %[[VAL_15:.*]]:2 = scf.while (%[[VAL_16:.*]] = %[[VAL_13]], %[[VAL_17:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) {
// CHECK: %[[VAL_18:.*]] = arith.cmpi ult, %[[VAL_16]], %[[VAL_14]] : index
// CHECK: scf.condition(%[[VAL_18]]) %[[VAL_16]], %[[VAL_17]] : index, index
// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]]
-// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_10]] : memref<32xf32>)
-// CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_3]]] : memref<?xindex>
-// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_10]] : memref<32xf32>)
+// CHECK-DAG: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_3]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_12:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_13:.*]] = %[[VAL_11]] to %[[VAL_12]] step %[[VAL_4]] {
// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_13]]] : memref<?xindex>
// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_14]]] : memref<32xf32>
// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf32>
// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]]
-// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>)
-// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref<?xindex>
-// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref<?xindex>
+// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>)
+// CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref<?xindex>
// CHECK: %[[VAL_15:.*]]:2 = scf.while (%[[VAL_16:.*]] = %[[VAL_13]], %[[VAL_17:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) {
// CHECK: %[[VAL_18:.*]] = arith.cmpi ult, %[[VAL_16]], %[[VAL_14]] : index
// CHECK: scf.condition(%[[VAL_18]]) %[[VAL_16]], %[[VAL_17]] : index, index
// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf32>
// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]]
-// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_10]] : memref<32xf32>)
-// CHECK: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
-// CHECK: %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_10]] : memref<32xf32>)
+// CHECK-DAG: %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_13:.*]] = %[[VAL_11]] to %[[VAL_12]] step %[[VAL_4]] {
// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_13]]] : memref<?xindex>
// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_13]]] : memref<?xf32>
// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]]
-// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>)
-// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
-// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
-// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref<?xindex>
-// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>)
+// CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: %[[VAL_17:.*]]:2 = scf.while (%[[VAL_18:.*]] = %[[VAL_13]], %[[VAL_19:.*]] = %[[VAL_15]]) : (index, index) -> (index, index) {
// CHECK: %[[VAL_20:.*]] = arith.cmpi ult, %[[VAL_18]], %[[VAL_14]] : index
// CHECK: %[[VAL_21:.*]] = arith.cmpi ult, %[[VAL_19]], %[[VAL_16]] : index
// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]]
-// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>)
-// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
-// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
-// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref<?xindex>
-// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>)
+// CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: %[[VAL_17:.*]]:2 = scf.while (%[[VAL_18:.*]] = %[[VAL_13]], %[[VAL_19:.*]] = %[[VAL_15]]) : (index, index) -> (index, index) {
// CHECK: %[[VAL_20:.*]] = arith.cmpi ult, %[[VAL_18]], %[[VAL_14]] : index
// CHECK: %[[VAL_21:.*]] = arith.cmpi ult, %[[VAL_19]], %[[VAL_16]] : index
// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_3]]
-// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_13]] : memref<16xf32>)
-// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
-// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
-// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref<?xindex>
-// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref<?xindex>
+// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_13]] : memref<16xf32>)
+// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK: %[[VAL_18:.*]]:2 = scf.while (%[[VAL_19:.*]] = %[[VAL_14]], %[[VAL_20:.*]] = %[[VAL_16]]) : (index, index) -> (index, index) {
// CHECK: %[[VAL_21:.*]] = arith.cmpi ult, %[[VAL_19]], %[[VAL_15]] : index
// CHECK: %[[VAL_22:.*]] = arith.cmpi ult, %[[VAL_20]], %[[VAL_17]] : index
// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_3]]
-// CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_13]] : memref<16xf32>)
-// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
-// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
-// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref<?xindex>
-// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref<?xindex>
+// CHECK-DAG: linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_13]] : memref<16xf32>)
+// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK: %[[VAL_18:.*]]:2 = scf.while (%[[VAL_19:.*]] = %[[VAL_14]], %[[VAL_20:.*]] = %[[VAL_16]]) : (index, index) -> (index, index) {
// CHECK: %[[VAL_21:.*]] = arith.cmpi ult, %[[VAL_19]], %[[VAL_15]] : index
// CHECK: %[[VAL_22:.*]] = arith.cmpi ult, %[[VAL_20]], %[[VAL_17]] : index
// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 0 : index} : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<f32>
-// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_11]][] : memref<f32>
-// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
-// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
-// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref<?xindex>
-// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_11]][] : memref<f32>
+// CHECK-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_3]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK: %[[VAL_18:.*]]:3 = scf.while (%[[VAL_19:.*]] = %[[VAL_14]], %[[VAL_20:.*]] = %[[VAL_16]], %[[VAL_21:.*]] = %[[VAL_13]]) : (index, index, f32) -> (index, index, f32) {
// CHECK: %[[VAL_22:.*]] = arith.cmpi ult, %[[VAL_19]], %[[VAL_15]] : index
// CHECK: %[[VAL_23:.*]] = arith.cmpi ult, %[[VAL_20]], %[[VAL_17]] : index
// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.indices %[[VAL_2]] {dimension = 0 : index} : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_2]] : tensor<16xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_3]] : memref<f32>
-// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_13]][] : memref<f32>
-// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_9]][] : memref<f32>
-// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
-// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
-// CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_4]]] : memref<?xindex>
-// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_5]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_15:.*]] = memref.load %[[VAL_13]][] : memref<f32>
+// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_9]][] : memref<f32>
+// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_19:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_20:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK: %[[VAL_21:.*]]:3 = scf.while (%[[VAL_22:.*]] = %[[VAL_17]], %[[VAL_23:.*]] = %[[VAL_19]], %[[VAL_24:.*]] = %[[VAL_15]]) : (index, index, f32) -> (index, index, f32) {
// CHECK: %[[VAL_25:.*]] = arith.cmpi ult, %[[VAL_22]], %[[VAL_18]] : index
// CHECK: %[[VAL_26:.*]] = arith.cmpi ult, %[[VAL_23]], %[[VAL_20]] : index
// CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.pointers %[[VAL_3]] {dimension = 0 : index} : tensor<?xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.indices %[[VAL_3]] {dimension = 0 : index} : tensor<?xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_15:.*]] = sparse_tensor.values %[[VAL_3]] : tensor<?xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf64>
-// CHECK-DAG: %[[VAL_16:.*]] = tensor.dim %[[VAL_4]], %[[VAL_5]] : tensor<?xf64>
+// CHECK-DAG: %[[VAL_16:.*]] = tensor.dim %[[VAL_0]], %[[VAL_5]] : tensor<?xf64>
// CHECK-DAG: %[[VAL_18:.*]] = bufferization.to_memref %[[VAL_4]]
-// CHECK: linalg.fill ins(%{{.*}} : f64) outs(%[[VAL_18]] : memref<?xf64>)
-// CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref<?xindex>
-// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref<?xindex>
-// CHECK: %[[VAL_21:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_5]]] : memref<?xindex>
-// CHECK: %[[VAL_22:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_7]]] : memref<?xindex>
+// CHECK-DAG: linalg.fill ins(%{{.*}} : f64) outs(%[[VAL_18]] : memref<?xf64>)
+// CHECK-DAG: %[[VAL_19:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_21:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_5]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_22:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_7]]] : memref<?xindex>
// CHECK: %[[VAL_23:.*]]:3 = scf.while (%[[VAL_24:.*]] = %[[VAL_19]], %[[VAL_25:.*]] = %[[VAL_21]], %[[VAL_26:.*]] = %[[VAL_5]]) : (index, index, index) -> (index, index, index) {
// CHECK: %[[VAL_27:.*]] = arith.cmpi ult, %[[VAL_24]], %[[VAL_20]] : index
// CHECK: %[[VAL_28:.*]] = arith.cmpi ult, %[[VAL_25]], %[[VAL_22]] : index
// CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.indices %[[VAL_2]] {dimension = 0 : index} : tensor<?xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_2]] : tensor<?xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> to memref<?xf64>
// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_3]] : memref<f64>
-// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_15]][] : memref<f64>
-// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
-// CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
-// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref<?xindex>
-// CHECK: %[[VAL_21:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref<?xindex>
-// CHECK: %[[VAL_22:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_4]]] : memref<?xindex>
-// CHECK: %[[VAL_23:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_5]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_15]][] : memref<f64>
+// CHECK-DAG: %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_19:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_21:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_5]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_22:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_4]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_23:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_5]]] : memref<?xindex>
// CHECK: %[[VAL_24:.*]]:4 = scf.while (%[[VAL_25:.*]] = %[[VAL_18]], %[[VAL_26:.*]] = %[[VAL_20]], %[[VAL_27:.*]] = %[[VAL_22]], %[[VAL_28:.*]] = %[[VAL_17]]) : (index, index, index, f64) -> (index, index, index, f64) {
// CHECK: %[[VAL_29:.*]] = arith.cmpi ult, %[[VAL_25]], %[[VAL_19]] : index
// CHECK: %[[VAL_30:.*]] = arith.cmpi ult, %[[VAL_26]], %[[VAL_21]] : index
// CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 1 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 1 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ] }>> to memref<?xf64>
-// CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_1]], %[[VAL_3]] : tensor<?x?xf64>
+// CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref<?x?xf64>
// CHECK: linalg.fill ins(%{{.*}} : f64) outs(%[[VAL_11]] : memref<?x?xf64>)
// CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_3]] to %[[VAL_8]] step %[[VAL_4]] {
// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 1 : index} : tensor<?x?xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<?x?xf32, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref<?x?xf32>
-// CHECK-DAG: %[[VAL_12:.*]] = tensor.dim %[[VAL_2]], %[[VAL_4]] : tensor<?x?xf32>
+// CHECK-DAG: %[[VAL_12:.*]] = tensor.dim %[[VAL_1]], %[[VAL_5]] : tensor<?x?xf32>
// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref<?x?xf32>
// CHECK-DAG: %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_3]] : memref<?x?xf32>
// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_19:.*]] = sparse_tensor.values %[[VAL_2]] : tensor<?x?xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ] }>> to memref<?xf32>
// CHECK-DAG: %[[VAL_20:.*]] = bufferization.to_memref %[[VAL_3]] : memref<?xf32>
// CHECK-DAG: %[[VAL_21:.*]] = bufferization.to_memref %[[VAL_4]] : memref<f32>
-// CHECK-DAG: %[[VAL_22:.*]] = tensor.dim %[[VAL_5]], %[[VAL_6]] : tensor<?xf32>
+// CHECK-DAG: %[[VAL_22:.*]] = tensor.dim %[[VAL_2]], %[[VAL_6]] : tensor<?x?xf32,
// CHECK-DAG: %[[VAL_24:.*]] = bufferization.to_memref %[[VAL_5]] : memref<?xf32>
// CHECK: %[[VAL_25:.*]] = memref.load %[[VAL_21]][] : memref<f32>
// CHECK: %[[VAL_26:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_6]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_1]] {dimension = 2 : index} : tensor<?x?x?xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 2 : index} : tensor<?x?x?xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "compressed" ] }>> to memref<?xindex>
// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?x?x?xf32, #sparse_tensor.encoding<{ dimLevelType = [ "dense", "dense", "compressed" ] }>> to memref<?xf32>
-// CHECK-DAG: %[[VAL_10:.*]] = tensor.dim %[[VAL_2]], %[[VAL_5]] : tensor<?x?xf32>
+// CHECK-DAG: %[[VAL_10:.*]] = tensor.dim %[[VAL_1]], %[[VAL_6]] : tensor<?x?x?xf32, #sparse_tensor.encoding<{{{.*}}}>>
// CHECK-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<?x?xf32>
// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_3]] : memref<?x?xf32>
-// CHECK-DAG: %[[VAL_13:.*]] = tensor.dim %[[VAL_0]], %[[VAL_5]] : tensor<?x?xf32>
-// CHECK-DAG: %[[VAL_14:.*]] = tensor.dim %[[VAL_0]], %[[VAL_6]] : tensor<?x?xf32>
+// CHECK-DAG: %[[VAL_13:.*]] = tensor.dim %[[VAL_1]], %[[VAL_5]] : tensor<?x?x?xf32, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK-DAG: %[[VAL_14:.*]] = tensor.dim %[[VAL_2]], %[[VAL_6]] : tensor<?x?xf32>
// CHECK-DAG: %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_0]] : memref<?x?xf32>
// CHECK: scf.for %[[VAL_17:.*]] = %[[VAL_5]] to %[[VAL_13]] step %[[VAL_6]] {
// CHECK: scf.for %[[VAL_18:.*]] = %[[VAL_5]] to %[[VAL_10]] step %[[VAL_6]] {
// CHECK-DAG: %[[VAL_6:.*]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor<?x?x?xf32>
// CHECK-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor<?x?x?xf32>
// CHECK-DAG: %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : memref<?x?x?xf32>
-// CHECK-DAG: %[[VAL_9:.*]] = tensor.dim %[[VAL_1]], %[[VAL_5]] : tensor<?xf32, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK-DAG: %[[VAL_9:.*]] = tensor.dim %[[VAL_0]], %[[VAL_5]] : tensor<?x?x?xf32>
// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?xf32, #sparse_tensor.encoding<{{{.*}}}>>
// CHECK-DAG: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<f32>
// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_12]][] : memref<f32>
// CHECK: %[[TMP_7:.*]] = memref.load %[[TMP_1]][%[[TMP_c1]]] : memref<?xindex>
// CHECK: scf.for %[[TMP_arg3:.*]] = %[[TMP_6]] to %[[TMP_7]] step %[[TMP_c1]] {
// CHECK: %[[TMP_23:.*]] = memref.load %[[TMP_2]][%[[TMP_arg3]]] : memref<?xindex>
-// CHECK: %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
-// CHECK: %[[TMP_25:.*]] = memref.load %[[TMP_3]][%[[TMP_arg3]]] : memref<?xindex>
+// CHECK-DAG: %[[TMP_25:.*]] = memref.load %[[TMP_3]][%[[TMP_arg3]]] : memref<?xindex>
+// CHECK-DAG: %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
// CHECK: %[[TMP_26:.*]] = memref.load %[[TMP_3]][%[[TMP_24]]] : memref<?xindex>
// CHECK: scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] {
// CHECK: %[[TMP_27:.*]] = memref.load %[[TMP_4]][%[[TMP_arg4]]] : memref<?xindex>
// CHECK: %[[TMP_14:.*]] = memref.load %[[TMP_8]][%[[TMP_c1]]] : memref<?xindex>
// CHECK: scf.for %[[TMP_arg3:.*]] = %[[TMP_13]] to %[[TMP_14]] step %[[TMP_c1]] {
// CHECK: %[[TMP_23:.*]] = memref.load %[[TMP_9]][%[[TMP_arg3]]] : memref<?xindex>
-// CHECK: %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
-// CHECK: %[[TMP_25:.*]] = memref.load %[[TMP_10]][%[[TMP_arg3]]] : memref<?xindex>
+// CHECK-DAG: %[[TMP_25:.*]] = memref.load %[[TMP_10]][%[[TMP_arg3]]] : memref<?xindex>
+// CHECK-DAG: %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
// CHECK: %[[TMP_26:.*]] = memref.load %[[TMP_10]][%[[TMP_24]]] : memref<?xindex>
// CHECK: scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] {
// CHECK: %[[TMP_27:.*]] = memref.load %[[TMP_11]][%[[TMP_arg4]]] : memref<?xindex>
// CHECK: %[[TMP_21:.*]] = memref.load %[[TMP_15]][%[[TMP_c1]]] : memref<?xindex>
// CHECK: scf.for %[[TMP_arg3:.*]] = %[[TMP_20]] to %[[TMP_21]] step %[[TMP_c1]] {
// CHECK: %[[TMP_23:.*]] = memref.load %[[TMP_16]][%[[TMP_arg3]]] : memref<?xindex>
-// CHECK: %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
// CHECK: %[[TMP_25:.*]] = memref.load %[[TMP_17]][%[[TMP_arg3]]] : memref<?xindex>
+// CHECK: %[[TMP_24:.*]] = arith.addi %[[TMP_arg3]], %[[TMP_c1]] : index
// CHECK: %[[TMP_26:.*]] = memref.load %[[TMP_17]][%[[TMP_24]]] : memref<?xindex>
// CHECK: scf.for %[[TMP_arg4:.*]] = %[[TMP_25]] to %[[TMP_26]] step %[[TMP_c1]] {
// CHECK: %[[TMP_27:.*]] = memref.load %[[TMP_18]][%[[TMP_arg4]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_4:.*]] = tensor.dim %[[VAL_0]], %[[VAL_1]] : tensor<?x?xi64, #sparse_tensor.encoding
// CHECK-DAG: %[[VAL_5:.*]] = bufferization.alloc_tensor(%[[VAL_3]], %[[VAL_4]]) : tensor<?x?xi64, #sparse_tensor.encoding
// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<?x?xi64, #sparse_tensor.encoding
-// CHECK-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_5]], %[[VAL_1]] : tensor<?x?xi64, #sparse_tensor.encoding
-// CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_5]], %[[VAL_2]] : tensor<?x?xi64, #sparse_tensor.encoding
+// CHECK-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_0]], %[[VAL_1]] : tensor<?x?xi64, #sparse_tensor.encoding
+// CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_0]], %[[VAL_2]] : tensor<?x?xi64, #sparse_tensor.encoding
+// CHECK-DAG: %[[VAL_24:.*]] = tensor.dim %[[VAL_5]], %[[VAL_2]] : tensor<?x?xi64, #sparse_tensor.encoding
// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_5]] : tensor<?x?xi64, #sparse_tensor.encoding
// CHECK: scf.for %[[VAL_10:.*]] = %[[VAL_1]] to %[[VAL_7]] step %[[VAL_2]] {
// CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_1]] to %[[VAL_8]] step %[[VAL_2]] {
// CHECK: %[[VAL_12:.*]] = arith.muli %[[VAL_8]], %[[VAL_10]] : index
// CHECK: %[[VAL_13:.*]] = arith.addi %[[VAL_12]], %[[VAL_11]] : index
-// CHECK: %[[VAL_14:.*]] = arith.muli %[[VAL_8]], %[[VAL_10]] : index
+// CHECK: %[[VAL_14:.*]] = arith.muli %[[VAL_24]], %[[VAL_10]] : index
// CHECK: %[[VAL_15:.*]] = arith.addi %[[VAL_14]], %[[VAL_11]] : index
// CHECK: %[[VAL_16:.*]] = arith.index_cast %[[VAL_11]] : index to i64
// CHECK: %[[VAL_17:.*]] = arith.index_cast %[[VAL_10]] : index to i64
// CHECK-HIR-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<64xf64>
// CHECK-HIR-DAG: %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64>
// CHECK-HIR: scf.for %[[VAL_12:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
-// CHECK-HIR: %[[VAL_13:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_12]]] : memref<64xf64>
-// CHECK-HIR: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref<?xindex>
-// CHECK-HIR: %[[VAL_15:.*]] = arith.addi %[[VAL_12]], %[[VAL_5]] : index
-// CHECK-HIR: %[[VAL_16:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_15]]] : memref<?xindex>
+// CHECK-HIR-DAG: %[[VAL_13:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_12]]] : memref<64xf64>
+// CHECK-HIR-DAG: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref<?xindex>
+// CHECK-HIR-DAG: %[[VAL_15:.*]] = arith.addi %[[VAL_12]], %[[VAL_5]] : index
+// CHECK-HIR-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_15]]] : memref<?xindex>
// CHECK-HIR: scf.for %[[VAL_17:.*]] = %[[VAL_14]] to %[[VAL_16]] step %[[VAL_5]] {
-// CHECK-HIR: %[[VAL_18:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_17]]] : memref<?xindex>
-// CHECK-HIR: %[[VAL_19:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_18]]] : memref<32xf64>
-// CHECK-HIR: %[[VAL_20:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_17]]] : memref<?xf64>
+// CHECK-HIR-DAG: %[[VAL_18:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_17]]] : memref<?xindex>
+// CHECK-HIR-DAG: %[[VAL_19:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_18]]] : memref<32xf64>
+// CHECK-HIR-DAG: %[[VAL_20:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_17]]] : memref<?xf64>
// CHECK-HIR: %[[VAL_21:.*]] = arith.mulf %[[VAL_20]], %[[VAL_13]] : f64
// CHECK-HIR: %[[VAL_22:.*]] = arith.addf %[[VAL_19]], %[[VAL_21]] : f64
// CHECK-HIR: memref.store %[[VAL_22]], %[[VAL_11]]{{\[}}%[[VAL_18]]] : memref<32xf64>
// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<?x?x?xf32, #sparse_tensor.encoding<{{{.*}}}>>
-// CHECK-DAG: %[[VAL_6:.*]] = tensor.dim %[[VAL_1]], %[[VAL_3]] : tensor<?x?x?xf32>
-// CHECK-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_1]], %[[VAL_4]] : tensor<?x?x?xf32>
-// CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_1]], %[[VAL_2]] : tensor<?x?x?xf32>
+// CHECK-DAG: %[[VAL_6:.*]] = tensor.dim %[[VAL_0]], %[[VAL_2]] : tensor<?x?x?xf32, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor<?x?x?xf32, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor<?x?x?xf32, #sparse_tensor.encoding<{{{.*}}}>>
// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<?x?x?xf32>
// CHECK: linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_10]] : memref<?x?x?xf32>)
-// CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_3]] to %[[VAL_7]] step %[[VAL_4]] {
-// CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_3]] to %[[VAL_8]] step %[[VAL_4]] {
-// CHECK: %[[VAL_13:.*]] = arith.muli %[[VAL_8]], %[[VAL_11]] : index
+// CHECK: scf.for %[[VAL_11:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_4]] {
+// CHECK: scf.for %[[VAL_12:.*]] = %[[VAL_3]] to %[[VAL_7]] step %[[VAL_4]] {
+// CHECK: %[[VAL_13:.*]] = arith.muli %[[VAL_7]], %[[VAL_11]] : index
// CHECK: %[[VAL_14:.*]] = arith.addi %[[VAL_13]], %[[VAL_12]] : index
-// CHECK: scf.for %[[VAL_15:.*]] = %[[VAL_3]] to %[[VAL_6]] step %[[VAL_4]] {
-// CHECK: %[[VAL_16:.*]] = arith.muli %[[VAL_6]], %[[VAL_14]] : index
+// CHECK: scf.for %[[VAL_15:.*]] = %[[VAL_3]] to %[[VAL_8]] step %[[VAL_4]] {
+// CHECK: %[[VAL_16:.*]] = arith.muli %[[VAL_8]], %[[VAL_14]] : index
// CHECK: %[[VAL_17:.*]] = arith.addi %[[VAL_16]], %[[VAL_15]] : index
// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_17]]] : memref<?xf32>
// CHECK: memref.store %[[VAL_18]], %[[VAL_10]]{{\[}}%[[VAL_15]], %[[VAL_11]], %[[VAL_12]]] : memref<?x?x?xf32>
// CHECK-RWT: %[[E0:.*]] = memref.load %[[P0]]{{\[}}%[[C1]]] : memref<?xindex>
// CHECK-RWT: scf.for %[[I:.*]] = %[[S0]] to %[[E0]] step %[[C1]] {
// CHECK-RWT: %[[SI0:.*]] = memref.load %[[I0]]{{\[}}%[[I]]] : memref<?xindex>
-// CHECK-RWT: %[[PE1:.*]] = arith.addi %[[I]], %[[C1]] : index
-// CHECK-RWT: %[[S1:.*]] = memref.load %[[P1]]{{\[}}%[[I]]] : memref<?xindex>
+// CHECK-RWT-DAG: %[[S1:.*]] = memref.load %[[P1]]{{\[}}%[[I]]] : memref<?xindex>
+// CHECK-RWT-DAG: %[[PE1:.*]] = arith.addi %[[I]], %[[C1]] : index
// CHECK-RWT: %[[E1:.*]] = memref.load %[[P1]]{{\[}}%[[PE1]]] : memref<?xindex>
// CHECK-RWT: scf.for %[[J:.*]] = %[[S1]] to %[[E1]] step %[[C1]] {
// CHECK-RWT: %[[SI1:.*]] = memref.load %[[I1]]{{\[}}%[[J]]] : memref<?xindex>
// CHECK-RWT: %[[E0:.*]] = memref.load %[[P0]]{{\[}}%[[C1]]] : memref<?xindex>
// CHECK-RWT: scf.for %[[I:.*]] = %[[S0]] to %[[E0]] step %[[C1]] {
// CHECK-RWT: %[[SI0:.*]] = memref.load %[[I0]]{{\[}}%[[I]]] : memref<?xindex>
-// CHECK-RWT: %[[PE1:.*]] = arith.addi %[[I]], %[[C1]] : index
-// CHECK-RWT: %[[S1:.*]] = memref.load %[[P1]]{{\[}}%[[I]]] : memref<?xindex>
+// CHECK-RWT-DAG: %[[S1:.*]] = memref.load %[[P1]]{{\[}}%[[I]]] : memref<?xindex>
+// CHECK-RWT-DAG: %[[PE1:.*]] = arith.addi %[[I]], %[[C1]] : index
// CHECK-RWT: %[[E1:.*]] = memref.load %[[P1]]{{\[}}%[[PE1]]] : memref<?xindex>
// CHECK-RWT: scf.for %[[J:.*]] = %[[S1]] to %[[E1]] step %[[C1]] {
// CHECK-RWT: %[[SI1:.*]] = memref.load %[[I1]]{{\[}}%[[J]]] : memref<?xindex>
// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 2.200000e+00 : f32
// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index
-// CHECK: %[[VAL_8:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : f32
-// CHECK: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref<?xindex>
-// CHECK: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref<?xindex>
-// CHECK: %[[VAL_11:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref<?xindex>
-// CHECK: %[[VAL_12:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref<?xindex>
-// CHECK: %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref<?xf32>
-// CHECK: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_1]] : memref<f32>
-// CHECK: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_4]] : memref<32x16xf32>
-// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_14]][] : memref<f32>
-// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_6]]] : memref<?xindex>
-// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_8:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : f32
+// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref<?xindex>
+// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref<?xindex>
+// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref<?xindex>
+// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 1 : index} : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref<?xindex>
+// CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse_tensor.encoding<{{.*}}>> to memref<?xf32>
+// CHECK-DAG: %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_1]] : memref<f32>
+// CHECK-DAG: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_4]] : memref<32x16xf32>
+// CHECK-DAG: %[[VAL_16:.*]] = memref.load %[[VAL_14]][] : memref<f32>
+// CHECK-DAG: %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_6]]] : memref<?xindex>
+// CHECK-DAG: %[[VAL_18:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref<?xindex>
// CHECK: scf.for %[[VAL_19:.*]] = %[[VAL_17]] to %[[VAL_18]] step %[[VAL_7]] {
// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_19]]] : memref<?xindex>
// CHECK: %[[VAL_21:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_19]]] : memref<?xindex>