The sparse compiler used to generate vectorized code for sparse tensor computations, but this task should really be delegated to other vectorization passes to allow better progressive lowering.
https://discourse.llvm.org/t/rfc-structured-codegen-beyond-rectangular-arrays/64707
Reviewed By: aartbik
Differential Revision: https://reviews.llvm.org/D136183
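
For reference, a minimal sketch (not part of the patch) of how a client would now set up these passes with the slimmed-down signatures; the pass-manager variable `pm` and the helper name `addSparsePasses` are assumptions for illustration only:

  #include "mlir/Dialect/SparseTensor/Transforms/Passes.h"
  #include "mlir/Pass/PassManager.h"

  void addSparsePasses(mlir::OpPassManager &pm, bool enableRuntimeLibrary) {
    // The rewrite pass now only takes the runtime-library flag.
    pm.addPass(mlir::createSparseTensorRewritePass(enableRuntimeLibrary));
    // SparsificationOptions carries only the parallelization strategy;
    // vectorization is left to later, dedicated vectorization passes.
    mlir::SparsificationOptions options(
        mlir::SparseParallelizationStrategy::kNone);
    pm.addPass(mlir::createSparsificationPass(options));
  }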
mlir::SparseParallelizationStrategy::kAnyStorageAnyLoop,
"any-storage-any-loop",
"Enable sparse parallelization for any storage and loop."))};
- PassOptions::Option<mlir::SparseVectorizationStrategy> vectorization{
- *this, "vectorization-strategy",
- ::llvm::cl::desc("Set the vectorization strategy"),
- ::llvm::cl::init(mlir::SparseVectorizationStrategy::kNone),
- llvm::cl::values(
- clEnumValN(mlir::SparseVectorizationStrategy::kNone, "none",
- "Turn off sparse vectorization."),
- clEnumValN(mlir::SparseVectorizationStrategy::kDenseInnerLoop,
- "dense-inner-loop",
- "Enable vectorization for dense inner loops."),
- clEnumValN(mlir::SparseVectorizationStrategy::kAnyStorageInnerLoop,
- "any-storage-inner-loop",
- "Enable sparse vectorization for inner loops with any "
- "storage."))};
-
- PassOptions::Option<int32_t> vectorLength{
- *this, "vl", desc("Set the vector length"), init(1)};
- PassOptions::Option<bool> enableSIMDIndex32{
- *this, "enable-simd-index32",
- desc("Enable i32 indexing into vectors (for efficiency)"), init(false)};
- PassOptions::Option<bool> enableVLAVectorization{
- *this, "enable-vla-vectorization",
- desc("Enable vector length agnostic vectorization"), init(false)};
+
PassOptions::Option<bool> enableRuntimeLibrary{
*this, "enable-runtime-library",
desc("Enable runtime library for manipulating sparse tensors"),
/// Projects out the options for `createSparsificationPass`.
SparsificationOptions sparsificationOptions() const {
- return SparsificationOptions(parallelization, vectorization, vectorLength,
- enableSIMDIndex32, enableVLAVectorization,
- enableRuntimeLibrary);
+ return SparsificationOptions(parallelization);
}
// These options must be kept in sync with `SparseTensorConversionBase`.
// TODO: support reduction parallelization too?
};
-/// Defines a vectorization strategy. Any inner loop is a candidate (full SIMD
-/// for parallel loops and horizontal SIMD for reduction loops). A loop is
-/// actually vectorized if (1) allowed by the strategy, and (2) the emitted
-/// code is an actual for-loop (and not a co-iterating while-loop).
-enum class SparseVectorizationStrategy {
- kNone,
- kDenseInnerLoop,
- kAnyStorageInnerLoop
-};
-
#define GEN_PASS_DECL
#include "mlir/Dialect/SparseTensor/Transforms/Passes.h.inc"
/// Options for the Sparsification pass.
struct SparsificationOptions {
- SparsificationOptions(SparseParallelizationStrategy p,
- SparseVectorizationStrategy v, unsigned vl, bool e,
- bool vla, bool rt)
- : parallelizationStrategy(p), vectorizationStrategy(v), vectorLength(vl),
- enableSIMDIndex32(e), enableVLAVectorization(vla),
- enableRuntimeLibrary(rt) {}
+ SparsificationOptions(SparseParallelizationStrategy p)
+ : parallelizationStrategy(p) {}
SparsificationOptions()
- : SparsificationOptions(SparseParallelizationStrategy::kNone,
- SparseVectorizationStrategy::kNone, 1u,
- /*enable SIMD Index32=*/false,
- /*enable VLA Vectorization=*/false,
- /*enable runtime library=*/true) {}
+ : SparsificationOptions(SparseParallelizationStrategy::kNone) {}
SparseParallelizationStrategy parallelizationStrategy;
- SparseVectorizationStrategy vectorizationStrategy;
- unsigned vectorLength;
- bool enableSIMDIndex32;
- bool enableVLAVectorization;
- bool enableRuntimeLibrary;
};
/// Sets up sparsification rewriting rules with the given options.
bool enableForeach, bool enableConvert);
std::unique_ptr<Pass> createSparseTensorRewritePass();
-std::unique_ptr<Pass>
-createSparseTensorRewritePass(const SparsificationOptions &options,
- bool enableForeach = true,
- bool enableConvert = true);
+std::unique_ptr<Pass> createSparseTensorRewritePass(bool enableRT,
+ bool enableForeach = true,
+ bool enableConvert = true);
//===----------------------------------------------------------------------===//
// Other rewriting rules and passes.
"memref::MemRefDialect",
"scf::SCFDialect",
"sparse_tensor::SparseTensorDialect",
- "vector::VectorDialect",
];
// TODO(57514): These enum options are duplicated in Passes.h.
let options = [
"Enable dense parallelization for any loop."),
clEnumValN(mlir::SparseParallelizationStrategy::kAnyStorageAnyLoop,
"any-storage-any-loop",
- "Enable sparse parallelization for any storage and loop."))}]>,
- Option<"vectorization", "vectorization-strategy", "mlir::SparseVectorizationStrategy",
- "mlir::SparseVectorizationStrategy::kNone",
- "Set the vectorization strategy", [{llvm::cl::values(
- clEnumValN(mlir::SparseVectorizationStrategy::kNone, "none",
- "Turn off sparse vectorization."),
- clEnumValN(mlir::SparseVectorizationStrategy::kDenseInnerLoop,
- "dense-inner-loop",
- "Enable vectorization for dense inner loops."),
- clEnumValN(mlir::SparseVectorizationStrategy::kAnyStorageInnerLoop,
- "any-storage-inner-loop",
- "Enable sparse vectorization for inner loops with any storage."))}]>,
- Option<"vectorLength", "vl", "int32_t", "1",
- "Set the vector length">,
- Option<"enableSIMDIndex32", "enable-simd-index32", "bool", "false",
- "Enable i32 indexing into vectors (for efficiency)">,
- Option<"enableVLAVectorization", "enable-vla-vectorization", "bool",
- "false", "Enable vector length agnostic vectorization">,
- Option<"enableRuntimeLibrary", "enable-runtime-library", "bool",
- "true", "Enable runtime library for manipulating sparse tensors">
+ "Enable sparse parallelization for any storage and loop."))}]>
];
}
"memref::MemRefDialect",
"scf::SCFDialect",
"sparse_tensor::SparseTensorDialect",
- "vector::VectorDialect",
];
let options = [
Option<"sparseToSparse", "s2s-strategy", "int32_t", "0",
/*analysisOnly=*/options.testBufferizationAnalysisOnly)));
if (options.testBufferizationAnalysisOnly)
return;
- pm.addPass(createSparseTensorRewritePass(options.sparsificationOptions()));
+ pm.addPass(createSparseTensorRewritePass(options.enableRuntimeLibrary));
pm.addPass(createSparsificationPass(options.sparsificationOptions()));
if (options.enableRuntimeLibrary)
pm.addPass(createSparseTensorConversionPass(
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/Types.h"
#include "mlir/IR/Value.h"
SparseTensorRewritePass() = default;
SparseTensorRewritePass(const SparseTensorRewritePass &pass) = default;
- SparseTensorRewritePass(const SparsificationOptions &options, bool foreach,
- bool convert) {
- enableRuntimeLibrary = options.enableRuntimeLibrary;
+ SparseTensorRewritePass(bool enableRT, bool foreach, bool convert) {
+ enableRuntimeLibrary = enableRT;
enableForeach = foreach;
enableConvert = convert;
}
SparsificationPass(const SparsificationPass &pass) = default;
SparsificationPass(const SparsificationOptions &options) {
parallelization = options.parallelizationStrategy;
- vectorization = options.vectorizationStrategy;
- vectorLength = options.vectorLength;
- enableSIMDIndex32 = options.enableSIMDIndex32;
- enableVLAVectorization = options.enableVLAVectorization;
- enableRuntimeLibrary = options.enableRuntimeLibrary;
}
void runOnOperation() override {
auto *ctx = &getContext();
// Translate strategy flags to strategy options.
- SparsificationOptions options(parallelization, vectorization, vectorLength,
- enableSIMDIndex32, enableVLAVectorization,
- enableRuntimeLibrary);
+ SparsificationOptions options(parallelization);
// Apply sparsification and vector cleanup rewriting.
RewritePatternSet patterns(ctx);
populateSparsificationPatterns(patterns, options);
return std::make_unique<SparseTensorRewritePass>();
}
-std::unique_ptr<Pass>
-mlir::createSparseTensorRewritePass(const SparsificationOptions &options,
- bool enableForeach, bool enableConvert) {
- return std::make_unique<SparseTensorRewritePass>(options, enableForeach,
+std::unique_ptr<Pass> mlir::createSparseTensorRewritePass(bool enableRT,
+ bool enableForeach,
+ bool enableConvert) {
+ return std::make_unique<SparseTensorRewritePass>(enableRT, enableForeach,
enableConvert);
}
#include "mlir/Dialect/SparseTensor/Transforms/Passes.h"
#include "mlir/Dialect/SparseTensor/Utils/Merger.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
-#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/TensorEncoding.h"
#include "llvm/ADT/SmallBitVector.h"
Value expFilled;
Value expAdded;
Value expCount;
- // Current vector length and mask.
- unsigned curVecLength = 1;
- Value curVecMask;
// Topsort (reference should remain in scope).
std::vector<unsigned> &topSort;
};
// Sparse compiler synthesis methods (reductions).
//===----------------------------------------------------------------------===//
-/// Maps reduction kind to vector::CombiningKind.
-static vector::CombiningKind getCombiningKind(Reduction kind) {
- switch (kind) {
- case kNoReduc:
- case kCustom:
- break;
- case kSum:
- return vector::CombiningKind::ADD;
- case kProduct:
- return vector::CombiningKind::MUL;
- case kAnd:
- return vector::CombiningKind::AND;
- case kOr:
- return vector::CombiningKind::OR;
- case kXor:
- return vector::CombiningKind::XOR;
- }
- llvm_unreachable("unknown reduction kind");
-}
-
/// Maps operation to reduction.
static Reduction getReduction(Kind kind) {
switch (kind) {
}
}
-/// Generates an initial value for a vector reduction, following the scheme
-/// given in Chapter 5 of "The Software Vectorization Handbook", where the
-/// initial scalar value is correctly embedded in the vector reduction value,
-/// and a straightforward horizontal reduction will complete the operation.
-static Value genVectorReducInit(CodeGen &codegen, OpBuilder &builder,
- Location loc, VectorType vtp) {
- Value r = codegen.redVal;
- switch (codegen.redKind) {
- case kNoReduc:
- case kCustom:
- break;
- case kSum:
- case kXor:
- // Initialize reduction vector to: | 0 | .. | 0 | r |
- return builder.create<vector::InsertElementOp>(
- loc, r, constantZero(builder, loc, vtp),
- constantIndex(builder, loc, 0));
- case kProduct:
- // Initialize reduction vector to: | 1 | .. | 1 | r |
- return builder.create<vector::InsertElementOp>(
- loc, r, constantOne(builder, loc, vtp), constantIndex(builder, loc, 0));
- case kAnd:
- case kOr:
- // Initialize reduction vector to: | r | .. | r | r |
- return builder.create<vector::BroadcastOp>(loc, vtp, r);
- }
- llvm_unreachable("unknown reduction kind");
-}
-
-/// Generates final value for a vector reduction.
-static Value genVectorReducEnd(CodeGen &codegen, OpBuilder &builder,
- Location loc, VectorType vtp) {
- vector::CombiningKind kind = getCombiningKind(codegen.redKind);
- return builder.create<vector::ReductionOp>(loc, kind, codegen.redVal);
-}
-
/// Updates scalarized reduction value.
static void updateReduc(Merger &merger, CodeGen &codegen, Value reduc) {
assert(codegen.redKind != kNoReduc);
}
}
-/// Constructs vector type.
-static VectorType vectorType(CodeGen &codegen, Type etp) {
- unsigned numScalableDims = codegen.options.enableVLAVectorization;
- return VectorType::get(codegen.curVecLength, etp, numScalableDims);
-}
-
-/// Constructs vector type from pointer.
-static VectorType vectorType(CodeGen &codegen, Value ptr) {
- return vectorType(codegen, ptr.getType().cast<MemRefType>().getElementType());
-}
-
-/// Constructs vector iteration mask.
-static Value genVectorMask(CodeGen &codegen, OpBuilder &builder, Value iv,
- Value lo, Value hi, Value step) {
- Location loc = iv.getLoc();
- VectorType mtp = vectorType(codegen, builder.getI1Type());
- // Special case if the vector length evenly divides the trip count (for
- // example, "for i = 0, 128, 16"). A constant all-true mask is generated
- // so that all subsequent masked memory operations are immediately folded
- // into unconditional memory operations.
- IntegerAttr loInt, hiInt, stepInt;
- if (matchPattern(lo, m_Constant(&loInt)) &&
- matchPattern(hi, m_Constant(&hiInt)) &&
- matchPattern(step, m_Constant(&stepInt))) {
- if (((hiInt.getInt() - loInt.getInt()) % stepInt.getInt()) == 0)
- return builder.create<vector::BroadcastOp>(
- loc, mtp, constantI1(builder, loc, true));
- }
- // Otherwise, generate a vector mask that avoids overrunning the upperbound
- // during vector execution. Here we rely on subsequent loop optimizations to
- // avoid executing the mask in all iterations, for example, by splitting the
- // loop into an unconditional vector loop and a scalar cleanup loop.
- auto minMap = AffineMap::get(
- /*dimCount=*/2, /*symbolCount=*/1,
- {builder.getAffineSymbolExpr(0),
- builder.getAffineDimExpr(0) - builder.getAffineDimExpr(1)},
- builder.getContext());
- Value end =
- builder.createOrFold<AffineMinOp>(loc, minMap, ValueRange{hi, iv, step});
- return builder.create<vector::CreateMaskOp>(loc, mtp, end);
-}
-
-/// Generates a vectorized load lhs = a[ind[lo:hi]] or lhs = a[lo:hi].
-static Value genVectorLoad(CodeGen &codegen, OpBuilder &builder, Value ptr,
- ArrayRef<Value> args) {
- Location loc = ptr.getLoc();
- VectorType vtp = vectorType(codegen, ptr);
- Value pass = constantZero(builder, loc, vtp);
- if (args.back().getType().isa<VectorType>()) {
- SmallVector<Value, 4> scalarArgs(args.begin(), args.end());
- Value indexVec = args.back();
- scalarArgs.back() = constantIndex(builder, loc, 0);
- return builder.create<vector::GatherOp>(loc, vtp, ptr, scalarArgs, indexVec,
- codegen.curVecMask, pass);
- }
- return builder.create<vector::MaskedLoadOp>(loc, vtp, ptr, args,
- codegen.curVecMask, pass);
-}
-
-/// Generates a vectorized store a[ind[lo:hi]] = rhs or a[lo:hi] = rhs.
-static void genVectorStore(CodeGen &codegen, OpBuilder &builder, Value rhs,
- Value ptr, ArrayRef<Value> args) {
- Location loc = ptr.getLoc();
- if (args.back().getType().isa<VectorType>()) {
- SmallVector<Value, 4> scalarArgs(args.begin(), args.end());
- Value indexVec = args.back();
- scalarArgs.back() = constantIndex(builder, loc, 0);
- builder.create<vector::ScatterOp>(loc, ptr, scalarArgs, indexVec,
- codegen.curVecMask, rhs);
- return;
- }
- builder.create<vector::MaskedStoreOp>(loc, ptr, args, codegen.curVecMask,
- rhs);
-}
-
-/// Generates a vectorized invariant. Here we rely on subsequent loop
-/// optimizations to hoist the invariant broadcast out of the vector loop.
-static Value genVectorInvariantValue(CodeGen &codegen, OpBuilder &builder,
- Value val) {
- VectorType vtp = vectorType(codegen, val.getType());
- return builder.create<vector::BroadcastOp>(val.getLoc(), vtp, val);
-}
-
/// Generates an affine expression.
//
// TODO: generalize for sparse tensor subscripts
linalg::GenericOp op, unsigned exp) {
// Test if the load was hoisted to a higher loop nest.
Value val = merger.exp(exp).val;
- if (val) {
- if (codegen.curVecLength > 1 && !val.getType().isa<VectorType>())
- return genVectorInvariantValue(codegen, builder, val);
+ if (val)
return val;
- }
+
// Load during insertion.
OpOperand &t = op->getOpOperand(merger.exp(exp).tensor);
if (&t == codegen.sparseOut) {
// Actual load.
SmallVector<Value, 4> args;
Value ptr = genSubscript(codegen, builder, op, &t, args);
- if (codegen.curVecLength > 1)
- return genVectorLoad(codegen, builder, ptr, args);
return builder.create<memref::LoadOp>(op.getLoc(), ptr, args);
}
Location loc = op.getLoc();
// Test if this is a scalarized reduction.
if (codegen.redVal) {
- if (codegen.curVecLength > 1)
- rhs = builder.create<arith::SelectOp>(loc, codegen.curVecMask, rhs,
- codegen.redVal);
updateReduc(merger, codegen, rhs);
return;
}
// Actual store.
SmallVector<Value, 4> args;
Value ptr = genSubscript(codegen, builder, op, t, args);
- if (codegen.curVecLength > 1)
- genVectorStore(codegen, builder, rhs, ptr, args);
- else
- builder.create<memref::StoreOp>(loc, rhs, ptr, args);
+ builder.create<memref::StoreOp>(loc, rhs, ptr, args);
}
/// Generates a pointer/index load from the sparse storage scheme. Narrower
/// index type used for looping and indexing.
static Value genLoad(CodeGen &codegen, OpBuilder &builder, Location loc,
Value ptr, Value s) {
- // See https://llvm.org/docs/GetElementPtr.html for some background on
- // the complications described below.
- if (codegen.curVecLength > 1) {
- // Since the index vector is used in a subsequent gather/scatter operations,
- // which effectively defines an unsigned pointer + signed index, we must
- // zero extend the vector to an index width. For 8-bit and 16-bit values,
- // an 32-bit index width suffices. For 32-bit values, zero extending the
- // elements into 64-bit loses some performance since the 32-bit indexed
- // gather/scatter is more efficient than the 64-bit index variant (if the
- // negative 32-bit index space is unused, the enableSIMDIndex32 flag can
- // preserve this performance). For 64-bit values, there is no good way
- // to state that the indices are unsigned, with creates the potential of
- // incorrect address calculations in the unlikely case we need such
- // extremely large offsets.
- Type etp = ptr.getType().cast<MemRefType>().getElementType();
- Value vload = genVectorLoad(codegen, builder, ptr, {s});
- if (!etp.isa<IndexType>()) {
- if (etp.getIntOrFloatBitWidth() < 32)
- vload = builder.create<arith::ExtUIOp>(
- loc, vectorType(codegen, builder.getI32Type()), vload);
- else if (etp.getIntOrFloatBitWidth() < 64 &&
- !codegen.options.enableSIMDIndex32)
- vload = builder.create<arith::ExtUIOp>(
- loc, vectorType(codegen, builder.getI64Type()), vload);
- }
- return vload;
- }
- // For the scalar case, we simply zero extend narrower indices into 64-bit
- // values before casting to index without a performance penalty. Here too,
- // however, indices that already are 64-bit, in theory, cannot express the
- // full range as explained above.
+ // Simply zero extends narrower indices into 64-bit values before casting to
+ // index without a performance penalty.
Value load = builder.create<memref::LoadOp>(loc, ptr, s);
if (!load.getType().isa<IndexType>()) {
if (load.getType().getIntOrFloatBitWidth() < 64)
static Value genInvariantValue(Merger &merger, CodeGen &codegen,
OpBuilder &builder, unsigned exp) {
Value val = merger.exp(exp).val;
- if (codegen.curVecLength > 1)
- return genVectorInvariantValue(codegen, builder, val);
return val;
}
static Value genAddress(CodeGen &codegen, OpBuilder &builder, Location loc,
Value size, Value p, Value i) {
Value mul = builder.create<arith::MulIOp>(loc, size, p);
- if (auto vtp = i.getType().dyn_cast<VectorType>()) {
- Value inv =
- builder.create<arith::IndexCastOp>(loc, vtp.getElementType(), mul);
- mul = genVectorInvariantValue(codegen, builder, inv);
- }
return builder.create<arith::AddIOp>(loc, mul, i);
}
static Value genIndexValue(CodeGen &codegen, OpBuilder &builder, unsigned idx,
unsigned ldx) {
Value ival = codegen.loops[idx];
- Type itype = ival.getType();
- // During vectorization, we either encounter:
- // (1) indices already in vector form, as in ... = ind[lo:hi], good to go, or
- // (2) single index, as in ... = i, must convert to [i, i+1, ...] for inner i.
- unsigned vl = codegen.curVecLength;
- if (vl > 1 && !itype.isa<VectorType>()) {
- Location loc = ival.getLoc();
- VectorType vtp = vectorType(codegen, itype);
- ival = builder.create<vector::BroadcastOp>(loc, vtp, ival);
- if (idx == ldx) {
- Value incr;
- if (vtp.isScalable()) {
- Type stepvty = vectorType(codegen, builder.getI64Type());
- Value stepv = builder.create<LLVM::StepVectorOp>(loc, stepvty);
- incr = builder.create<arith::IndexCastOp>(loc, vtp, stepv);
- } else {
- SmallVector<APInt, 4> integers;
- for (unsigned i = 0; i < vl; i++)
- integers.push_back(APInt(/*width=*/64, i));
- auto values = DenseElementsAttr::get(vtp, integers);
- incr = builder.create<arith::ConstantOp>(loc, vtp, values);
- }
- ival = builder.create<arith::AddIOp>(loc, ival, incr);
- }
- }
return ival;
}
return needsUniv;
}
-/// Returns vectorization strategy. Any implicit inner loop in the Linalg
-/// operation is a candidate. Whether it is actually converted to SIMD code
-/// depends on the requested strategy.
-static bool isVectorFor(CodeGen &codegen, bool isInner, bool isReduction,
- bool isSparse) {
- // Reject vectorization of sparse output, unless innermost is reduction.
- if (codegen.sparseOut && !isReduction)
- return false;
- // Inspect strategy.
- switch (codegen.options.vectorizationStrategy) {
- case SparseVectorizationStrategy::kNone:
- return false;
- case SparseVectorizationStrategy::kDenseInnerLoop:
- return isInner && !isSparse;
- case SparseVectorizationStrategy::kAnyStorageInnerLoop:
- return isInner;
- }
- llvm_unreachable("unexpected vectorization strategy");
-}
-
/// Returns parallelization strategy. Any implicit loop in the Linalg operation
/// that is marked "parallel" is a candidate. Whether it is actually converted
/// to a parallel operation depends on the requested strategy.
static bool isParallelFor(CodeGen &codegen, bool isOuter, bool isReduction,
- bool isSparse, bool isVector) {
+ bool isSparse) {
// Reject parallelization of sparse output.
if (codegen.sparseOut)
return false;
case SparseParallelizationStrategy::kNone:
return false;
case SparseParallelizationStrategy::kDenseOuterLoop:
- return isOuter && !isSparse && !isReduction && !isVector;
+ return isOuter && !isSparse && !isReduction;
case SparseParallelizationStrategy::kAnyStorageOuterLoop:
- return isOuter && !isReduction && !isVector;
+ return isOuter && !isReduction;
case SparseParallelizationStrategy::kDenseAnyLoop:
- return !isSparse && !isReduction && !isVector;
+ return !isSparse && !isReduction;
case SparseParallelizationStrategy::kAnyStorageAnyLoop:
- return !isReduction && !isVector;
+ return !isReduction;
}
llvm_unreachable("unexpected parallelization strategy");
}
-/// Checks unit stride for dense tensors. The iteration graph may have ignored
-/// dense access patterns in order to avoid cycles (sparse access patterns are
-/// always placed innermost), but that means dense access has become strided.
-/// This prevents effective vectorization.
-static bool denseUnitStrides(Merger &merger, linalg::GenericOp op,
- unsigned idx) {
- for (OpOperand &t : op->getOpOperands()) {
- if (!getSparseTensorEncoding(t.get().getType())) {
- auto map = op.getMatchingIndexingMap(&t);
- for (unsigned d = 0, rank = map.getNumResults(); d < rank; d++) {
- AffineExpr a = map.getResult(d);
- // Report non-unit stride if innermost index appears at an outer
- // dimension (true non-unit stride) or if the innermost index appears
- // in a compound subscript in the innermost dimension. Even if the
- // latter is unit stride, it does not play well with scatter/gather.
- // TODO: accept unit stride affine innermost like a[i,j+k+1]?
- if (a.isFunctionOfDim(idx) &&
- ((d != rank - 1) || (a.getKind() != AffineExprKind::DimId)))
- return false;
- }
- }
- }
- return true;
-}
-
/// Generates a for-loop on a single index.
static Operation *genFor(Merger &merger, CodeGen &codegen, OpBuilder &builder,
linalg::GenericOp op, bool isOuter, bool isInner,
bool isReduction = linalg::isReductionIterator(iteratorTypes[idx]);
bool isSparse = isCompressedDLT(merger.getDimLevelType(fb)) ||
isSingletonDLT(merger.getDimLevelType(fb));
- bool isVector = isVectorFor(codegen, isInner, isReduction, isSparse) &&
- denseUnitStrides(merger, op, idx);
- bool isParallel =
- isParallelFor(codegen, isOuter, isReduction, isSparse, isVector);
-
- // Prepare vector length.
- if (isVector)
- codegen.curVecLength = codegen.options.vectorLength;
+ bool isParallel = isParallelFor(codegen, isOuter, isReduction, isSparse);
// Loop bounds and increment.
Location loc = op.getLoc();
Value lo = isSparse ? codegen.pidxs[tensor][idx] : codegen.loops[idx];
Value hi = isSparse ? codegen.highs[tensor][idx] : codegen.sizes[idx];
- Value step = constantIndex(builder, loc, codegen.curVecLength);
- if (isVector && codegen.options.enableVLAVectorization) {
- Value vscale = builder.create<vector::VectorScaleOp>(
- loc, IndexType::get(builder.getContext()));
- step = builder.create<arith::MulIOp>(loc, vscale, step);
- }
+ Value step = constantIndex(builder, loc, 1);
// Emit a parallel loop.
if (isParallel) {
- assert(!isVector);
scf::ParallelOp parOp = builder.create<scf::ParallelOp>(loc, lo, hi, step);
if (isSparse)
codegen.pidxs[tensor][idx] = parOp.getInductionVars()[0];
// Emit a sequential or vector loop.
SmallVector<Value, 4> operands;
- if (codegen.redVal) {
- // In a vector loop, bring reduction into SIMD form, if not already.
- if (isVector && !codegen.redVal.getType().isa<VectorType>()) {
- VectorType vtp = vectorType(codegen, codegen.redVal.getType());
- Value vred = genVectorReducInit(codegen, builder, loc, vtp);
- updateReduc(merger, codegen, vred);
- }
+ if (codegen.redVal)
operands.push_back(codegen.redVal);
- }
if (codegen.expValues)
operands.push_back(codegen.expCount);
+
scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, operands);
+
if (codegen.redVal)
updateReduc(merger, codegen, forOp.getRegionIterArgs().front());
if (codegen.expValues)
codegen.pidxs[tensor][idx] = iv;
else
codegen.loops[idx] = iv;
+
builder.setInsertionPointToStart(forOp.getBody());
- // Share vector iteration mask between all subsequent loads/stores.
- if (isVector)
- codegen.curVecMask = genVectorMask(codegen, builder, iv, lo, hi, step);
return forOp;
}
static bool startLoopSeq(Merger &merger, CodeGen &codegen, OpBuilder &builder,
linalg::GenericOp op, unsigned exp, unsigned at,
unsigned idx, unsigned ldx, unsigned lts) {
- assert(codegen.curVecLength == 1);
assert(!codegen.loops[idx]);
// Emit invariants at this loop sequence level.
genInvariants(merger, codegen, builder, op, exp, ldx, /*atStart=*/true);
static Operation *startLoop(Merger &merger, CodeGen &codegen,
OpBuilder &builder, linalg::GenericOp op,
unsigned at, unsigned li, bool needsUniv) {
- assert(codegen.curVecLength == 1);
// Emit the for/while-loop control.
Operation *loop = genLoop(merger, codegen, builder, op, at, needsUniv,
merger.lat(li).simple);
static bool endLoop(Merger &merger, CodeGen &codegen, OpBuilder &builder,
linalg::GenericOp op, Operation *loop, unsigned idx,
unsigned li, bool needsUniv) {
- codegen.curVecLength = 1;
// End a while-loop.
if (auto whileOp = dyn_cast<scf::WhileOp>(loop)) {
genWhileInduction(merger, codegen, builder, op, idx, needsUniv,
static void endLoopSeq(Merger &merger, CodeGen &codegen, OpBuilder &builder,
linalg::GenericOp op, unsigned exp, unsigned at,
unsigned idx, unsigned ldx) {
- assert(codegen.curVecLength == 1);
assert(codegen.loops[idx]);
codegen.loops[idx] = Value();
- // Bring a pending reduction back from SIMD form when sequence ends.
- if (codegen.redVal)
- if (auto vtp = codegen.redVal.getType().dyn_cast<VectorType>())
- updateReduc(merger, codegen,
- genVectorReducEnd(codegen, builder, op.getLoc(), vtp));
// Unmark bookkeeping of invariants and loop index.
genInvariants(merger, codegen, builder, op, exp, ldx, /*atStart=*/false);
// Finalize access pattern expansion for sparse tensor output.
// RUN: mlir-opt %s -sparsification="parallelization-strategy=none" | \
// RUN: FileCheck %s --check-prefix=CHECK-PAR0
-// RUN: mlir-opt %s -sparsification="parallelization-strategy=dense-outer-loop" | \
-// RUN: FileCheck %s --check-prefix=CHECK-PAR1
-// RUN: mlir-opt %s -sparsification="parallelization-strategy=any-storage-outer-loop" | \
-// RUN: FileCheck %s --check-prefix=CHECK-PAR2
-// RUN: mlir-opt %s -sparsification="parallelization-strategy=dense-any-loop" | \
-// RUN: FileCheck %s --check-prefix=CHECK-PAR3
-// RUN: mlir-opt %s -sparsification="parallelization-strategy=any-storage-any-loop" | \
-// RUN: FileCheck %s --check-prefix=CHECK-PAR4
+// FIXME: we do not support vectorization/parallel loops in the loop emitter right now
+// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=dense-outer-loop" | \
+// R_U_N: FileCheck %s --check-prefix=CHECK-PAR1
+// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=any-storage-outer-loop" | \
+// R_U_N: FileCheck %s --check-prefix=CHECK-PAR2
+// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=dense-any-loop" | \
+// R_U_N: FileCheck %s --check-prefix=CHECK-PAR3
+// R_U_N: mlir-opt %s -sparsification="parallelization-strategy=any-storage-any-loop" | \
+// R_U_N: FileCheck %s --check-prefix=CHECK-PAR4
#DenseMatrix = #sparse_tensor.encoding<{
dimLevelType = [ "dense", "dense" ]
-// RUN: mlir-opt %s -sparsification="vectorization-strategy=none vl=16" -cse -split-input-file | \
-// RUN: FileCheck %s --check-prefix=CHECK-VEC0
-// RUN: mlir-opt %s -sparsification="vectorization-strategy=dense-inner-loop vl=16" -cse -split-input-file | \
-// RUN: FileCheck %s --check-prefix=CHECK-VEC1
-// RUN: mlir-opt %s -sparsification="vectorization-strategy=any-storage-inner-loop vl=16" -cse -split-input-file | \
-// RUN: FileCheck %s --check-prefix=CHECK-VEC2
-// RUN: mlir-opt %s -sparsification="vectorization-strategy=any-storage-inner-loop vl=16 enable-simd-index32=true" -cse -split-input-file | \
-// RUN: FileCheck %s --check-prefix=CHECK-VEC3
-// RUN: mlir-opt %s -sparsification="vectorization-strategy=any-storage-inner-loop vl=4 enable-vla-vectorization=true" -cse -split-input-file | \
-// RUN: FileCheck %s --check-prefix=CHECK-VEC4
+// RUN: mlir-opt %s -sparsification -cse -split-input-file | \
+// RUN: FileCheck %s
#DenseVector = #sparse_tensor.encoding<{ dimLevelType = [ "dense" ] }>
}
//
-// CHECK-VEC0-LABEL: func @scale_d
-// CHECK-VEC0-DAG: %[[c0:.*]] = arith.constant 0 : index
-// CHECK-VEC0-DAG: %[[c1:.*]] = arith.constant 1 : index
-// CHECK-VEC0-DAG: %[[c1024:.*]] = arith.constant 1024 : index
-// CHECK-VEC0: scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c1]] {
-// CHECK-VEC0: %[[l:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xf32>
-// CHECK-VEC0: %[[m:.*]] = arith.mulf %[[l]], %{{.*}} : f32
-// CHECK-VEC0: store %[[m]], %{{.*}}[%[[i]]] : memref<1024xf32>
-// CHECK-VEC0: }
-// CHECK-VEC0: return
-//
-// CHECK-VEC1-LABEL: func @scale_d
-// CHECK-VEC1-DAG: %[[c0:.*]] = arith.constant 0 : index
-// CHECK-VEC1-DAG: %[[c16:.*]] = arith.constant 16 : index
-// CHECK-VEC1-DAG: %[[c1024:.*]] = arith.constant 1024 : index
-// CHECK-VEC1: scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c16]] {
-// CHECK-VEC1: %[[r:.*]] = vector.load %{{.*}}[%[[i]]] : memref<?xf32>, vector<16xf32>
-// CHECK-VEC1: %[[b:.*]] = vector.broadcast %{{.*}} : f32 to vector<16xf32>
-// CHECK-VEC1: %[[m:.*]] = arith.mulf %[[r]], %[[b]] : vector<16xf32>
-// CHECK-VEC1: vector.store %[[m]], %{{.*}}[%[[i]]] : memref<1024xf32>, vector<16xf32>
-// CHECK-VEC1: }
-// CHECK-VEC1: return
-//
-// CHECK-VEC2-LABEL: func @scale_d
-// CHECK-VEC2-DAG: %[[c0:.*]] = arith.constant 0 : index
-// CHECK-VEC2-DAG: %[[c16:.*]] = arith.constant 16 : index
-// CHECK-VEC2-DAG: %[[c1024:.*]] = arith.constant 1024 : index
-// CHECK-VEC2: scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c16]] {
-// CHECK-VEC2: %[[r:.*]] = vector.load %{{.*}}[%[[i]]] : memref<?xf32>, vector<16xf32>
-// CHECK-VEC2: %[[b:.*]] = vector.broadcast %{{.*}} : f32 to vector<16xf32>
-// CHECK-VEC2: %[[m:.*]] = arith.mulf %[[r]], %[[b]] : vector<16xf32>
-// CHECK-VEC2: vector.store %[[m]], %{{.*}}[%[[i]]] : memref<1024xf32>, vector<16xf32>
-// CHECK-VEC2: }
-// CHECK-VEC2: return
-//
-// CHECK-VEC4: #[[$map:.*]] = affine_map<(d0, d1)[s0] -> (s0, d0 - d1)
-// CHECK-VEC4-LABEL: func @scale_d
-// CHECK-VEC4-DAG: %[[c0:.*]] = arith.constant 0 : index
-// CHECK-VEC4-DAG: %[[c4:.*]] = arith.constant 4 : index
-// CHECK-VEC4-DAG: %[[c1024:.*]] = arith.constant 1024 : index
-// CHECK-VEC4-DAG: %[[v0:.*]] = arith.constant dense<0.000000e+00> : vector<[4]xf32>
-// CHECK-VEC4-DAG: %[[vscale:.*]] = vector.vscale
-// CHECK-VEC4: %[[step:.*]] = arith.muli %[[vscale]], %[[c4]] : index
-// CHECK-VEC4: scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[step]] {
-// CHECK-VEC4: %[[sub:.*]] = affine.min #[[$map]](%[[c1024]], %[[i]])[%[[step]]]
-// CHECK-VEC4: %[[mask:.*]] = vector.create_mask %[[sub]] : vector<[4]xi1>
-// CHECK-VEC4: %[[val:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %[[v0]] : memref<?xf32>, vector<[4]xi1>, vector<[4]xf32> into vector<[4]xf32>
-// CHECK-VEC4: %[[scalev:.*]] = vector.broadcast %{{.*}} : f32 to vector<[4]xf32>
-// CHECK-VEC4: %[[scaled:.*]] = arith.mulf %[[val]], %[[scalev]] : vector<[4]xf32>
-// CHECK-VEC4: vector.maskedstore %{{.*}}[%[[i]]], %[[mask]], %[[scaled]] : memref<1024xf32>, vector<[4]xi1>, vector<[4]xf32>
-// CHECK-VEC4: }
-// CHECK-VEC4: return
+// CHECK-LABEL: func @scale_d
+// CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[c1024:.*]] = arith.constant 1024 : index
+// CHECK: scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c1]] {
+// CHECK: %[[l:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xf32>
+// CHECK: %[[m:.*]] = arith.mulf %[[l]], %{{.*}} : f32
+// CHECK: store %[[m]], %{{.*}}[%[[i]]] : memref<1024xf32>
+// CHECK: }
+// CHECK: return
//
+
func.func @scale_d(%arga: tensor<1024xf32, #DenseVector>, %b: f32, %argx: tensor<1024xf32>) -> tensor<1024xf32> {
%0 = linalg.generic #trait_scale_d
ins(%arga: tensor<1024xf32, #DenseVector>)
}
//
-// CHECK-VEC0-LABEL: func @mul_s
-// CHECK-VEC0-DAG: %[[c0:.*]] = arith.constant 0 : index
-// CHECK-VEC0-DAG: %[[c1:.*]] = arith.constant 1 : index
-// CHECK-VEC0: %[[p:.*]] = memref.load %{{.*}}[%[[c0]]] : memref<?xi32>
-// CHECK-VEC0: %[[a:.*]] = arith.extui %[[p]] : i32 to i64
-// CHECK-VEC0: %[[q:.*]] = arith.index_cast %[[a]] : i64 to index
-// CHECK-VEC0: %[[r:.*]] = memref.load %{{.*}}[%[[c1]]] : memref<?xi32>
-// CHECK-VEC0: %[[b:.*]] = arith.extui %[[r]] : i32 to i64
-// CHECK-VEC0: %[[s:.*]] = arith.index_cast %[[b]] : i64 to index
-// CHECK-VEC0: scf.for %[[i:.*]] = %[[q]] to %[[s]] step %[[c1]] {
-// CHECK-VEC0: %[[li:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xi32>
-// CHECK-VEC0: %[[zi:.*]] = arith.extui %[[li]] : i32 to i64
-// CHECK-VEC0: %[[ci:.*]] = arith.index_cast %[[zi]] : i64 to index
-// CHECK-VEC0: %[[la:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xf32>
-// CHECK-VEC0: %[[lb:.*]] = memref.load %{{.*}}[%[[ci]]] : memref<1024xf32>
-// CHECK-VEC0: %[[m:.*]] = arith.mulf %[[la]], %[[lb]] : f32
-// CHECK-VEC0: store %[[m]], %{{.*}}[%[[ci]]] : memref<1024xf32>
-// CHECK-VEC0: }
-// CHECK-VEC0: return
-//
-// CHECK-VEC1-LABEL: func @mul_s
-// CHECK-VEC1-DAG: %[[c0:.*]] = arith.constant 0 : index
-// CHECK-VEC1-DAG: %[[c1:.*]] = arith.constant 1 : index
-// CHECK-VEC1: %[[p:.*]] = memref.load %{{.*}}[%[[c0]]] : memref<?xi32>
-// CHECK-VEC1: %[[a:.*]] = arith.extui %[[p]] : i32 to i64
-// CHECK-VEC1: %[[q:.*]] = arith.index_cast %[[a]] : i64 to index
-// CHECK-VEC1: %[[r:.*]] = memref.load %{{.*}}[%[[c1]]] : memref<?xi32>
-// CHECK-VEC1: %[[b:.*]] = arith.extui %[[r]] : i32 to i64
-// CHECK-VEC1: %[[s:.*]] = arith.index_cast %[[b]] : i64 to index
-// CHECK-VEC1: scf.for %[[i:.*]] = %[[q]] to %[[s]] step %[[c1]] {
-// CHECK-VEC1: %[[li:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xi32>
-// CHECK-VEC1: %[[zi:.*]] = arith.extui %[[li]] : i32 to i64
-// CHECK-VEC1: %[[ci:.*]] = arith.index_cast %[[zi]] : i64 to index
-// CHECK-VEC1: %[[la:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xf32>
-// CHECK-VEC1: %[[lb:.*]] = memref.load %{{.*}}[%[[ci]]] : memref<1024xf32>
-// CHECK-VEC1: %[[m:.*]] = arith.mulf %[[la]], %[[lb]] : f32
-// CHECK-VEC1: store %[[m]], %{{.*}}[%[[ci]]] : memref<1024xf32>
-// CHECK-VEC1: }
-// CHECK-VEC1: return
-//
-// CHECK-VEC2: #[[$map:.*]] = affine_map<(d0, d1)[s0] -> (16, d0 - d1)
-// CHECK-VEC2-LABEL: func @mul_s
-// CHECK-VEC2-DAG: %[[c0:.*]] = arith.constant 0 : index
-// CHECK-VEC2-DAG: %[[c1:.*]] = arith.constant 1 : index
-// CHECK-VEC2-DAG: %[[c16:.*]] = arith.constant 16 : index
-// CHECK-VEC2: %[[p:.*]] = memref.load %{{.*}}[%[[c0]]] : memref<?xi32>
-// CHECK-VEC2: %[[a:.*]] = arith.extui %[[p]] : i32 to i64
-// CHECK-VEC2: %[[q:.*]] = arith.index_cast %[[a]] : i64 to index
-// CHECK-VEC2: %[[r:.*]] = memref.load %{{.*}}[%[[c1]]] : memref<?xi32>
-// CHECK-VEC2: %[[b:.*]] = arith.extui %[[r]] : i32 to i64
-// CHECK-VEC2: %[[s:.*]] = arith.index_cast %[[b]] : i64 to index
-// CHECK-VEC2: scf.for %[[i:.*]] = %[[q]] to %[[s]] step %[[c16]] {
-// CHECK-VEC2: %[[sub:.*]] = affine.min #[[$map]](%[[s]], %[[i]])[%[[c16]]]
-// CHECK-VEC2: %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1>
-// CHECK-VEC2: %[[li:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref<?xi32>, vector<16xi1>, vector<16xi32> into vector<16xi32>
-// CHECK-VEC2: %[[zi:.*]] = arith.extui %[[li]] : vector<16xi32> to vector<16xi64>
-// CHECK-VEC2: %[[la:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
-// CHECK-VEC2: %[[lb:.*]] = vector.gather %{{.*}}[%[[c0]]] [%[[zi]]], %[[mask]], %{{.*}} : memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32> into vector<16xf32>
-// CHECK-VEC2: %[[m:.*]] = arith.mulf %[[la]], %[[lb]] : vector<16xf32>
-// CHECK-VEC2: vector.scatter %{{.*}}[%[[c0]]] [%[[zi]]], %[[mask]], %[[m]] : memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32>
-// CHECK-VEC2: }
-// CHECK-VEC2: return
-//
-// CHECK-VEC3: #[[$map:.*]] = affine_map<(d0, d1)[s0] -> (16, d0 - d1)
-// CHECK-VEC3-LABEL: func @mul_s
-// CHECK-VEC3-DAG: %[[c0:.*]] = arith.constant 0 : index
-// CHECK-VEC3-DAG: %[[c1:.*]] = arith.constant 1 : index
-// CHECK-VEC3-DAG: %[[c16:.*]] = arith.constant 16 : index
-// CHECK-VEC3: %[[p:.*]] = memref.load %{{.*}}[%[[c0]]] : memref<?xi32>
-// CHECK-VEC3: %[[a:.*]] = arith.extui %[[p]] : i32 to i64
-// CHECK-VEC3: %[[q:.*]] = arith.index_cast %[[a]] : i64 to index
-// CHECK-VEC3: %[[r:.*]] = memref.load %{{.*}}[%[[c1]]] : memref<?xi32>
-// CHECK-VEC3: %[[b:.*]] = arith.extui %[[r]] : i32 to i64
-// CHECK-VEC3: %[[s:.*]] = arith.index_cast %[[b]] : i64 to index
-// CHECK-VEC3: scf.for %[[i:.*]] = %[[q]] to %[[s]] step %[[c16]] {
-// CHECK-VEC3: %[[sub:.*]] = affine.min #[[$map]](%[[s]], %[[i]])[%[[c16]]]
-// CHECK-VEC3: %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1>
-// CHECK-VEC3: %[[li:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref<?xi32>, vector<16xi1>, vector<16xi32> into vector<16xi32>
-// CHECK-VEC3: %[[la:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
-// CHECK-VEC3: %[[lb:.*]] = vector.gather %{{.*}}[%[[c0]]] [%[[li]]], %[[mask]], %{{.*}} : memref<1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
-// CHECK-VEC3: %[[m:.*]] = arith.mulf %[[la]], %[[lb]] : vector<16xf32>
-// CHECK-VEC3: vector.scatter %{{.*}}[%[[c0]]] [%[[li]]], %[[mask]], %[[m]] : memref<1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32>
-// CHECK-VEC3: }
-// CHECK-VEC3: return
-//
-// CHECK-VEC4: #[[$map:.*]] = affine_map<(d0, d1)[s0] -> (s0, d0 - d1)
-// CHECK-VEC4-LABEL: func @mul_s
-// CHECK-VEC4-DAG: %[[c0:.*]] = arith.constant 0 : index
-// CHECK-VEC4-DAG: %[[c1:.*]] = arith.constant 1 : index
-// CHECK-VEC4-DAG: %[[c4:.*]] = arith.constant 4 : index
-// CHECK-VEC4-DAG: %[[v0i:.*]] = arith.constant dense<0> : vector<[4]xi32>
-// CHECK-VEC4-DAG: %[[v0f:.*]] = arith.constant dense<0.000000e+00> : vector<[4]xf32>
-// CHECK-VEC4: %[[p:.*]] = memref.load %{{.*}}[%[[c0]]] : memref<?xi32>
-// CHECK-VEC4: %[[a:.*]] = arith.extui %[[p]] : i32 to i64
-// CHECK-VEC4: %[[q:.*]] = arith.index_cast %[[a]] : i64 to index
-// CHECK-VEC4: %[[r:.*]] = memref.load %{{.*}}[%[[c1]]] : memref<?xi32>
-// CHECK-VEC4: %[[b:.*]] = arith.extui %[[r]] : i32 to i64
-// CHECK-VEC4: %[[s:.*]] = arith.index_cast %[[b]] : i64 to index
-// CHECK-VEC4: %[[vscale:.*]] = vector.vscale
-// CHECK-VEC4: %[[step:.*]] = arith.muli %[[vscale]], %[[c4]] : index
-// CHECK-VEC4: scf.for %[[i:.*]] = %[[q]] to %[[s]] step %[[step]] {
-// CHECK-VEC4: %[[sub:.*]] = affine.min #[[$map]](%[[s]], %[[i]])[%[[step]]]
-// CHECK-VEC4: %[[mask:.*]] = vector.create_mask %[[sub]] : vector<[4]xi1>
-// CHECK-VEC4: %[[li:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %[[v0i]] : memref<?xi32>, vector<[4]xi1>, vector<[4]xi32> into vector<[4]xi32>
-// CHECK-VEC4: %[[lii64:.*]] = arith.extui %[[li]] : vector<[4]xi32> to vector<[4]xi64>
-// CHECK-VEC4: %[[la:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %[[v0f]] : memref<?xf32>, vector<[4]xi1>, vector<[4]xf32> into vector<[4]xf32>
-// CHECK-VEC4: %[[lb:.*]] = vector.gather %{{.*}}[%[[c0]]] [%[[lii64]]], %[[mask]], %[[v0f]] : memref<1024xf32>, vector<[4]xi64>, vector<[4]xi1>, vector<[4]xf32> into vector<[4]xf32>
-// CHECK-VEC4: %[[m:.*]] = arith.mulf %[[la]], %[[lb]] : vector<[4]xf32>
-// CHECK-VEC4: vector.scatter %{{.*}}[%[[c0]]] [%[[lii64]]], %[[mask]], %[[m]] : memref<1024xf32>, vector<[4]xi64>, vector<[4]xi1>, vector<[4]xf32>
-// CHECK-VEC4: }
-// CHECK-VEC4: return
+// CHECK-LABEL: func @mul_s
+// CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
+// CHECK: %[[p:.*]] = memref.load %{{.*}}[%[[c0]]] : memref<?xi32>
+// CHECK: %[[a:.*]] = arith.extui %[[p]] : i32 to i64
+// CHECK: %[[q:.*]] = arith.index_cast %[[a]] : i64 to index
+// CHECK: %[[r:.*]] = memref.load %{{.*}}[%[[c1]]] : memref<?xi32>
+// CHECK: %[[b:.*]] = arith.extui %[[r]] : i32 to i64
+// CHECK: %[[s:.*]] = arith.index_cast %[[b]] : i64 to index
+// CHECK: scf.for %[[i:.*]] = %[[q]] to %[[s]] step %[[c1]] {
+// CHECK: %[[li:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xi32>
+// CHECK: %[[zi:.*]] = arith.extui %[[li]] : i32 to i64
+// CHECK: %[[ci:.*]] = arith.index_cast %[[zi]] : i64 to index
+// CHECK: %[[la:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xf32>
+// CHECK: %[[lb:.*]] = memref.load %{{.*}}[%[[ci]]] : memref<1024xf32>
+// CHECK: %[[m:.*]] = arith.mulf %[[la]], %[[lb]] : f32
+// CHECK: store %[[m]], %{{.*}}[%[[ci]]] : memref<1024xf32>
+// CHECK: }
+// CHECK: return
//
func.func @mul_s(%arga: tensor<1024xf32, #SparseVector>, %argb: tensor<1024xf32>, %argx: tensor<1024xf32>) -> tensor<1024xf32> {
%0 = linalg.generic #trait_mul_s
}
//
-// CHECK-VEC0-LABEL: func @reduction_d
-// CHECK-VEC0-DAG: %[[c0:.*]] = arith.constant 0 : index
-// CHECK-VEC0-DAG: %[[c1:.*]] = arith.constant 1 : index
-// CHECK-VEC0-DAG: %[[c1024:.*]] = arith.constant 1024 : index
-// CHECK-VEC0: %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c1]] iter_args(%[[red_in:.*]] = %{{.*}}) -> (f32) {
-// CHECK-VEC0: %[[la:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xf32>
-// CHECK-VEC0: %[[lb:.*]] = memref.load %{{.*}}[%[[i]]] : memref<1024xf32>
-// CHECK-VEC0: %[[m:.*]] = arith.mulf %[[la]], %[[lb]] : f32
-// CHECK-VEC0: %[[a:.*]] = arith.addf %[[red_in]], %[[m]] : f32
-// CHECK-VEC0: scf.yield %[[a]] : f32
-// CHECK-VEC0: }
-// CHECK-VEC0: return
-//
-// CHECK-VEC1-LABEL: func @reduction_d
-// CHECK-VEC1-DAG: %[[c0:.*]] = arith.constant 0 : index
-// CHECK-VEC1-DAG: %[[c16:.*]] = arith.constant 16 : index
-// CHECK-VEC1-DAG: %[[c1024:.*]] = arith.constant 1024 : index
-// CHECK-VEC1-DAG: %[[v0:.*]] = arith.constant dense<0.000000e+00> : vector<16xf32>
-// CHECK-VEC1: %[[l:.*]] = memref.load %{{.*}}[] : memref<f32>
-// CHECK-VEC1: %[[r:.*]] = vector.insertelement %[[l]], %[[v0]][%[[c0]] : index] : vector<16xf32>
-// CHECK-VEC1: %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c16]] iter_args(%[[red_in:.*]] = %[[r]]) -> (vector<16xf32>) {
-// CHECK-VEC1: %[[la:.*]] = vector.load %{{.*}}[%[[i]]] : memref<?xf32>, vector<16xf32>
-// CHECK-VEC1: %[[lb:.*]] = vector.load %{{.*}}[%[[i]]] : memref<1024xf32>, vector<16xf32>
-// CHECK-VEC1: %[[m:.*]] = arith.mulf %[[la]], %[[lb]] : vector<16xf32>
-// CHECK-VEC1: %[[a:.*]] = arith.addf %[[red_in]], %[[m]] : vector<16xf32>
-// CHECK-VEC1: scf.yield %[[a]] : vector<16xf32>
-// CHECK-VEC1: }
-// CHECK-VEC1: %{{.*}} = vector.reduction <add>, %[[red]] : vector<16xf32> into f32
-// CHECK-VEC1: return
-//
-// CHECK-VEC2-LABEL: func @reduction_d
-// CHECK-VEC2-DAG: %[[c0:.*]] = arith.constant 0 : index
-// CHECK-VEC2-DAG: %[[c16:.*]] = arith.constant 16 : index
-// CHECK-VEC2-DAG: %[[c1024:.*]] = arith.constant 1024 : index
-// CHECK-VEC2-DAG: %[[v0:.*]] = arith.constant dense<0.000000e+00> : vector<16xf32>
-// CHECK-VEC2: %[[l:.*]] = memref.load %{{.*}}[] : memref<f32>
-// CHECK-VEC2: %[[r:.*]] = vector.insertelement %[[l]], %[[v0]][%[[c0]] : index] : vector<16xf32>
-// CHECK-VEC2: %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c16]] iter_args(%[[red_in:.*]] = %[[r]]) -> (vector<16xf32>) {
-// CHECK-VEC2: %[[la:.*]] = vector.load %{{.*}}[%[[i]]] : memref<?xf32>, vector<16xf32>
-// CHECK-VEC2: %[[lb:.*]] = vector.load %{{.*}}[%[[i]]] : memref<1024xf32>, vector<16xf32>
-// CHECK-VEC2: %[[m:.*]] = arith.mulf %[[la]], %[[lb]] : vector<16xf32>
-// CHECK-VEC2: %[[a:.*]] = arith.addf %[[red_in]], %[[m]] : vector<16xf32>
-// CHECK-VEC2: scf.yield %[[a]] : vector<16xf32>
-// CHECK-VEC2: }
-// CHECK-VEC2: %{{.*}} = vector.reduction <add>, %[[red]] : vector<16xf32> into f32
-// CHECK-VEC2: return
-//
-// CHECK-VEC4: #[[$map:.*]] = affine_map<(d0, d1)[s0] -> (s0, d0 - d1)
-// CHECK-VEC4-LABEL: func @reduction_d
-// CHECK-VEC4-DAG: %[[c0:.*]] = arith.constant 0 : index
-// CHECK-VEC4-DAG: %[[c4:.*]] = arith.constant 4 : index
-// CHECK-VEC4-DAG: %[[c1024:.*]] = arith.constant 1024 : index
-// CHECK-VEC4-DAG: %[[v0:.*]] = arith.constant dense<0.000000e+00> : vector<[4]xf32>
-// CHECK-VEC4: %[[l:.*]] = memref.load %{{.*}}[] : memref<f32>
-// CHECK-VEC4: %[[vscale:.*]] = vector.vscale
-// CHECK-VEC4: %[[step:.*]] = arith.muli %[[vscale]], %[[c4]] : index
-// CHECK-VEC4: %[[r:.*]] = vector.insertelement %[[l]], %[[v0]][%[[c0]] : index] : vector<[4]xf32>
-// CHECK-VEC4: %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[step]] iter_args(%[[red_in:.*]] = %[[r]]) -> (vector<[4]xf32>) {
-// CHECK-VEC4: %[[sub:.*]] = affine.min #[[$map]](%[[c1024]], %[[i]])[%[[step]]]
-// CHECK-VEC4: %[[mask:.*]] = vector.create_mask %[[sub]] : vector<[4]xi1>
-// CHECK-VEC4: %[[la:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %[[v0]] : memref<?xf32>, vector<[4]xi1>, vector<[4]xf32> into vector<[4]xf32>
-// CHECK-VEC4: %[[lb:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %[[v0]] : memref<1024xf32>, vector<[4]xi1>, vector<[4]xf32> into vector<[4]xf32>
-// CHECK-VEC4: %[[m:.*]] = arith.mulf %[[la]], %[[lb]] : vector<[4]xf32>
-// CHECK-VEC4: %[[a:.*]] = arith.addf %[[red_in]], %[[m]] : vector<[4]xf32>
-// CHECK-VEC4: %[[sa:.*]] = arith.select %[[mask]], %[[a]], %[[red_in]] : vector<[4]xi1>, vector<[4]xf32>
-// CHECK-VEC4: scf.yield %[[sa]] : vector<[4]xf32>
-// CHECK-VEC4: }
-// CHECK-VEC4: %{{.*}} = vector.reduction <add>, %[[red]] : vector<[4]xf32> into f32
-// CHECK-VEC4: return
+// CHECK-LABEL: func @reduction_d
+// CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[c1024:.*]] = arith.constant 1024 : index
+// CHECK: %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c1]] iter_args(%[[red_in:.*]] = %{{.*}}) -> (f32) {
+// CHECK: %[[la:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xf32>
+// CHECK: %[[lb:.*]] = memref.load %{{.*}}[%[[i]]] : memref<1024xf32>
+// CHECK: %[[m:.*]] = arith.mulf %[[la]], %[[lb]] : f32
+// CHECK: %[[a:.*]] = arith.addf %[[red_in]], %[[m]] : f32
+// CHECK: scf.yield %[[a]] : f32
+// CHECK: }
+// CHECK: return
//
func.func @reduction_d(%arga: tensor<1024xf32, #DenseVector>, %argb: tensor<1024xf32>, %argx: tensor<f32>) -> tensor<f32> {
%0 = linalg.generic #trait_reduction_d
}
//
-// CHECK-VEC0-LABEL: func @mul_ds
-// CHECK-VEC0-DAG: %[[c0:.*]] = arith.constant 0 : index
-// CHECK-VEC0-DAG: %[[c1:.*]] = arith.constant 1 : index
-// CHECK-VEC0-DAG: %[[c512:.*]] = arith.constant 512 : index
-// CHECK-VEC0: scf.for %[[i:.*]] = %[[c0]] to %[[c512]] step %[[c1]] {
-// CHECK-VEC0: %[[p:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xi32>
-// CHECK-VEC0: %[[a:.*]] = arith.extui %[[p]] : i32 to i64
-// CHECK-VEC0: %[[q:.*]] = arith.index_cast %[[a]] : i64 to index
-// CHECK-VEC0: %[[a:.*]] = arith.addi %[[i]], %[[c1]] : index
-// CHECK-VEC0: %[[r:.*]] = memref.load %{{.*}}[%[[a]]] : memref<?xi32>
-// CHECK-VEC0: %[[b:.*]] = arith.extui %[[r]] : i32 to i64
-// CHECK-VEC0: %[[s:.*]] = arith.index_cast %[[b]] : i64 to index
-// CHECK-VEC0: scf.for %[[j:.*]] = %[[q]] to %[[s]] step %[[c1]] {
-// CHECK-VEC0: %[[lj:.*]] = memref.load %{{.*}}[%[[j]]] : memref<?xi32>
-// CHECK-VEC0: %[[zj:.*]] = arith.extui %[[lj]] : i32 to i64
-// CHECK-VEC0: %[[cj:.*]] = arith.index_cast %[[zj]] : i64 to index
-// CHECK-VEC0: %[[la:.*]] = memref.load %{{.*}}[%[[j]]] : memref<?xf32>
-// CHECK-VEC0: %[[lb:.*]] = memref.load %{{.*}}[%[[i]], %[[cj]]] : memref<512x1024xf32>
-// CHECK-VEC0: %[[m:.*]] = arith.mulf %[[la]], %[[lb]] : f32
-// CHECK-VEC0: store %[[m]], %{{.*}}[%[[i]], %[[cj]]] : memref<512x1024xf32>
-// CHECK-VEC0: }
-// CHECK-VEC0: }
-// CHECK-VEC0: return
-//
-// CHECK-VEC1-LABEL: func @mul_ds
-// CHECK-VEC1-DAG: %[[c0:.*]] = arith.constant 0 : index
-// CHECK-VEC1-DAG: %[[c1:.*]] = arith.constant 1 : index
-// CHECK-VEC1-DAG: %[[c512:.*]] = arith.constant 512 : index
-// CHECK-VEC1: scf.for %[[i:.*]] = %[[c0]] to %[[c512]] step %[[c1]] {
-// CHECK-VEC1: %[[p:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xi32>
-// CHECK-VEC1: %[[a:.*]] = arith.extui %[[p]] : i32 to i64
-// CHECK-VEC1: %[[q:.*]] = arith.index_cast %[[a]] : i64 to index
-// CHECK-VEC1: %[[a:.*]] = arith.addi %[[i]], %[[c1]] : index
-// CHECK-VEC1: %[[r:.*]] = memref.load %{{.*}}[%[[a]]] : memref<?xi32>
-// CHECK-VEC1: %[[b:.*]] = arith.extui %[[r]] : i32 to i64
-// CHECK-VEC1: %[[s:.*]] = arith.index_cast %[[b]] : i64 to index
-// CHECK-VEC1: scf.for %[[j:.*]] = %[[q]] to %[[s]] step %[[c1]] {
-// CHECK-VEC1: %[[lj:.*]] = memref.load %{{.*}}[%[[j]]] : memref<?xi32>
-// CHECK-VEC1: %[[zj:.*]] = arith.extui %[[lj]] : i32 to i64
-// CHECK-VEC1: %[[cj:.*]] = arith.index_cast %[[zj]] : i64 to index
-// CHECK-VEC1: %[[la:.*]] = memref.load %{{.*}}[%[[j]]] : memref<?xf32>
-// CHECK-VEC1: %[[lb:.*]] = memref.load %{{.*}}[%[[i]], %[[cj]]] : memref<512x1024xf32>
-// CHECK-VEC1: %[[m:.*]] = arith.mulf %[[la]], %[[lb]] : f32
-// CHECK-VEC1: store %[[m]], %{{.*}}[%[[i]], %[[cj]]] : memref<512x1024xf32>
-// CHECK-VEC1: }
-// CHECK-VEC1: }
-// CHECK-VEC1: return
-//
-// CHECK-VEC2: #[[$map:.*]] = affine_map<(d0, d1)[s0] -> (16, d0 - d1)
-// CHECK-VEC2-LABEL: func @mul_ds
-// CHECK-VEC2-DAG: %[[c0:.*]] = arith.constant 0 : index
-// CHECK-VEC2-DAG: %[[c1:.*]] = arith.constant 1 : index
-// CHECK-VEC2-DAG: %[[c16:.*]] = arith.constant 16 : index
-// CHECK-VEC2-DAG: %[[c512:.*]] = arith.constant 512 : index
-// CHECK-VEC2: scf.for %[[i:.*]] = %[[c0]] to %[[c512]] step %[[c1]] {
-// CHECK-VEC2: %[[p:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xi32>
-// CHECK-VEC2: %[[a:.*]] = arith.extui %[[p]] : i32 to i64
-// CHECK-VEC2: %[[q:.*]] = arith.index_cast %[[a]] : i64 to index
-// CHECK-VEC2: %[[a:.*]] = arith.addi %[[i]], %[[c1]] : index
-// CHECK-VEC2: %[[r:.*]] = memref.load %{{.*}}[%[[a]]] : memref<?xi32>
-// CHECK-VEC2: %[[b:.*]] = arith.extui %[[r]] : i32 to i64
-// CHECK-VEC2: %[[s:.*]] = arith.index_cast %[[b]] : i64 to index
-// CHECK-VEC2: scf.for %[[j:.*]] = %[[q]] to %[[s]] step %[[c16]] {
-// CHECK-VEC2: %[[sub:.*]] = affine.min #[[$map]](%[[s]], %[[j]])[%[[c16]]]
-// CHECK-VEC2: %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1>
-// CHECK-VEC2: %[[lj:.*]] = vector.maskedload %{{.*}}[%[[j]]], %[[mask]], %{{.*}} : memref<?xi32>, vector<16xi1>, vector<16xi32> into vector<16xi32>
-// CHECK-VEC2: %[[zj:.*]] = arith.extui %[[lj]] : vector<16xi32> to vector<16xi64>
-// CHECK-VEC2: %[[la:.*]] = vector.maskedload %{{.*}}[%[[j]]], %[[mask]], %{{.*}} : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
-// CHECK-VEC2: %[[lb:.*]] = vector.gather %{{.*}}[%[[i]], %[[c0]]] [%[[zj]]], %[[mask]], %{{.*}} : memref<512x1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32> into vector<16xf32>
-// CHECK-VEC2: %[[m:.*]] = arith.mulf %[[la]], %[[lb]] : vector<16xf32>
-// CHECK-VEC2: vector.scatter %{{.*}}[%[[i]], %[[c0]]] [%[[zj]]], %[[mask]], %[[m]] : memref<512x1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32>
-// CHECK-VEC2: }
-// CHECK-VEC2: }
-// CHECK-VEC2: return
-//
-// CHECK-VEC3: #[[$map:.*]] = affine_map<(d0, d1)[s0] -> (16, d0 - d1)
-// CHECK-VEC3-LABEL: func @mul_ds
-// CHECK-VEC3-DAG: %[[c0:.*]] = arith.constant 0 : index
-// CHECK-VEC3-DAG: %[[c1:.*]] = arith.constant 1 : index
-// CHECK-VEC3-DAG: %[[c16:.*]] = arith.constant 16 : index
-// CHECK-VEC3-DAG: %[[c512:.*]] = arith.constant 512 : index
-// CHECK-VEC3: scf.for %[[i:.*]] = %[[c0]] to %[[c512]] step %[[c1]] {
-// CHECK-VEC3: %[[p:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xi32>
-// CHECK-VEC3: %[[a:.*]] = arith.extui %[[p]] : i32 to i64
-// CHECK-VEC3: %[[q:.*]] = arith.index_cast %[[a]] : i64 to index
-// CHECK-VEC3: %[[a:.*]] = arith.addi %[[i]], %[[c1]] : index
-// CHECK-VEC3: %[[r:.*]] = memref.load %{{.*}}[%[[a]]] : memref<?xi32>
-// CHECK-VEC3: %[[b:.*]] = arith.extui %[[r]] : i32 to i64
-// CHECK-VEC3: %[[s:.*]] = arith.index_cast %[[b]] : i64 to index
-// CHECK-VEC3: scf.for %[[j:.*]] = %[[q]] to %[[s]] step %[[c16]] {
-// CHECK-VEC3: %[[sub:.*]] = affine.min #[[$map]](%[[s]], %[[j]])[%[[c16]]]
-// CHECK-VEC3: %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1>
-// CHECK-VEC3: %[[lj:.*]] = vector.maskedload %{{.*}}[%[[j]]], %[[mask]], %{{.*}} : memref<?xi32>, vector<16xi1>, vector<16xi32> into vector<16xi32>
-// CHECK-VEC3: %[[la:.*]] = vector.maskedload %{{.*}}[%[[j]]], %[[mask]], %{{.*}} : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
-// CHECK-VEC3: %[[lb:.*]] = vector.gather %{{.*}}[%[[i]], %[[c0]]] [%[[lj]]], %[[mask]], %{{.*}} : memref<512x1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
-// CHECK-VEC3: %[[m:.*]] = arith.mulf %[[la]], %[[lb]] : vector<16xf32>
-// CHECK-VEC3: vector.scatter %{{.*}}[%[[i]], %[[c0]]] [%[[lj]]], %[[mask]], %[[m]] : memref<512x1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32>
-// CHECK-VEC3: }
-// CHECK-VEC3: }
-// CHECK-VEC3: return
-//
-// CHECK-VEC4: #[[$map:.*]] = affine_map<(d0, d1)[s0] -> (s0, d0 - d1)
-// CHECK-VEC4-LABEL: func @mul_ds
-// CHECK-VEC4-DAG: %[[c0:.*]] = arith.constant 0 : index
-// CHECK-VEC4-DAG: %[[c1:.*]] = arith.constant 1 : index
-// CHECK-VEC4-DAG: %[[c4:.*]] = arith.constant 4 : index
-// CHECK-VEC4-DAG: %[[c512:.*]] = arith.constant 512 : index
-// CHECK-VEC4-DAG: %[[v0i:.*]] = arith.constant dense<0> : vector<[4]xi32>
-// CHECK-VEC4-DAG: %[[v0f:.*]] = arith.constant dense<0.000000e+00> : vector<[4]xf32>
-// CHECK-VEC4: scf.for %[[i:.*]] = %[[c0]] to %[[c512]] step %[[c1]] {
-// CHECK-VEC4: %[[p:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xi32>
-// CHECK-VEC4: %[[a:.*]] = arith.extui %[[p]] : i32 to i64
-// CHECK-VEC4: %[[q:.*]] = arith.index_cast %[[a]] : i64 to index
-// CHECK-VEC4: %[[a:.*]] = arith.addi %[[i]], %[[c1]] : index
-// CHECK-VEC4: %[[r:.*]] = memref.load %{{.*}}[%[[a]]] : memref<?xi32>
-// CHECK-VEC4: %[[b:.*]] = arith.extui %[[r]] : i32 to i64
-// CHECK-VEC4: %[[s:.*]] = arith.index_cast %[[b]] : i64 to index
-// CHECK-VEC4: %[[vscale:.*]] = vector.vscale
-// CHECK-VEC4: %[[step:.*]] = arith.muli %[[vscale]], %[[c4]] : index
-// CHECK-VEC4: scf.for %[[j:.*]] = %[[q]] to %[[s]] step %[[step]] {
-// CHECK-VEC4: %[[sub:.*]] = affine.min #[[$map]](%[[s]], %[[j]])[%[[step]]]
-// CHECK-VEC4: %[[mask:.*]] = vector.create_mask %[[sub]] : vector<[4]xi1>
-// CHECK-VEC4: %[[lji32:.*]] = vector.maskedload %{{.*}}[%[[j]]], %[[mask]], %[[v0i]] : memref<?xi32>, vector<[4]xi1>, vector<[4]xi32> into vector<[4]xi32>
-// CHECK-VEC4: %[[lj:.*]] = arith.extui %[[lji32]] : vector<[4]xi32> to vector<[4]xi64>
-// CHECK-VEC4: %[[la:.*]] = vector.maskedload %{{.*}}[%[[j]]], %[[mask]], %[[v0f]] : memref<?xf32>, vector<[4]xi1>, vector<[4]xf32> into vector<[4]xf32>
-// CHECK-VEC4: %[[lb:.*]] = vector.gather %{{.*}}[%[[i]], %[[c0]]] [%[[lj]]], %[[mask]], %[[v0f]] : memref<512x1024xf32>, vector<[4]xi64>, vector<[4]xi1>, vector<[4]xf32> into vector<[4]xf32>
-// CHECK-VEC4: %[[m:.*]] = arith.mulf %[[la]], %[[lb]] : vector<[4]xf32>
-// CHECK-VEC4: vector.scatter %{{.*}}[%[[i]], %[[c0]]] [%[[lj]]], %[[mask]], %[[m]] : memref<512x1024xf32>, vector<[4]xi64>, vector<[4]xi1>, vector<[4]xf32>
-// CHECK-VEC4: }
-// CHECK-VEC4: }
-// CHECK-VEC4: return
+// CHECK-LABEL: func @mul_ds
+// CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[c512:.*]] = arith.constant 512 : index
+// CHECK: scf.for %[[i:.*]] = %[[c0]] to %[[c512]] step %[[c1]] {
+// CHECK: %[[p:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xi32>
+// CHECK: %[[a:.*]] = arith.extui %[[p]] : i32 to i64
+// CHECK: %[[q:.*]] = arith.index_cast %[[a]] : i64 to index
+// CHECK: %[[a:.*]] = arith.addi %[[i]], %[[c1]] : index
+// CHECK: %[[r:.*]] = memref.load %{{.*}}[%[[a]]] : memref<?xi32>
+// CHECK: %[[b:.*]] = arith.extui %[[r]] : i32 to i64
+// CHECK: %[[s:.*]] = arith.index_cast %[[b]] : i64 to index
+// CHECK: scf.for %[[j:.*]] = %[[q]] to %[[s]] step %[[c1]] {
+// CHECK: %[[lj:.*]] = memref.load %{{.*}}[%[[j]]] : memref<?xi32>
+// CHECK: %[[zj:.*]] = arith.extui %[[lj]] : i32 to i64
+// CHECK: %[[cj:.*]] = arith.index_cast %[[zj]] : i64 to index
+// CHECK: %[[la:.*]] = memref.load %{{.*}}[%[[j]]] : memref<?xf32>
+// CHECK: %[[lb:.*]] = memref.load %{{.*}}[%[[i]], %[[cj]]] : memref<512x1024xf32>
+// CHECK: %[[m:.*]] = arith.mulf %[[la]], %[[lb]] : f32
+// CHECK: store %[[m]], %{{.*}}[%[[i]], %[[cj]]] : memref<512x1024xf32>
+// CHECK: }
+// CHECK: }
+// CHECK: return
//
func.func @mul_ds(%arga: tensor<512x1024xf32, #SparseMatrix>, %argb: tensor<512x1024xf32>, %argx: tensor<512x1024xf32>) -> tensor<512x1024xf32> {
%0 = linalg.generic #trait_mul_ds
}
//
-// CHECK-VEC0-LABEL: func @add_dense
-// CHECK-VEC0-DAG: %[[c0:.*]] = arith.constant 0 : index
-// CHECK-VEC0-DAG: %[[c1:.*]] = arith.constant 1 : index
-// CHECK-VEC0-DAG: %[[c32:.*]] = arith.constant 32 : index
-// CHECK-VEC0: scf.for %[[i:.*]] = %[[c0]] to %[[c32]] step %[[c1]] {
-// CHECK-VEC0: %[[lo:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xindex>
-// CHECK-VEC0: %[[i1:.*]] = arith.addi %[[i]], %[[c1]] : index
-// CHECK-VEC0: %[[hi:.*]] = memref.load %{{.*}}[%[[i1]]] : memref<?xindex>
-// CHECK-VEC0: scf.for %[[jj:.*]] = %[[lo]] to %[[hi]] step %[[c1]] {
-// CHECK-VEC0: %[[j:.*]] = memref.load %{{.*}}[%[[jj]]] : memref<?xindex>
-// CHECK-VEC0: %[[x:.*]] = memref.load %{{.*}}[%[[i1]], %[[j]]] : memref<33x64xf64>
-// CHECK-VEC0: %[[a:.*]] = memref.load %{{.*}}[%[[jj]]] : memref<?xf64>
-// CHECK-VEC0: %[[s:.*]] = arith.addf %[[x]], %[[a]] : f64
-// CHECK-VEC0: memref.store %[[s]], %{{.*}}[%[[i1]], %[[j]]] : memref<33x64xf64>
-// CHECK-VEC0: }
-// CHECK-VEC0: }
-// CHECK-VEC0: return
-//
-// CHECK-VEC1-LABEL: func @add_dense
-// CHECK-VEC1-DAG: %[[c0:.*]] = arith.constant 0 : index
-// CHECK-VEC1-DAG: %[[c1:.*]] = arith.constant 1 : index
-// CHECK-VEC1-DAG: %[[c32:.*]] = arith.constant 32 : index
-// CHECK-VEC1: scf.for %[[i:.*]] = %[[c0]] to %[[c32]] step %[[c1]] {
-// CHECK-VEC1: %[[lo:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xindex>
-// CHECK-VEC1: %[[i1:.*]] = arith.addi %[[i]], %[[c1]] : index
-// CHECK-VEC1: %[[hi:.*]] = memref.load %{{.*}}[%[[i1]]] : memref<?xindex>
-// CHECK-VEC1: scf.for %[[jj:.*]] = %[[lo]] to %[[hi]] step %[[c1]] {
-// CHECK-VEC1: %[[j:.*]] = memref.load %{{.*}}[%[[jj]]] : memref<?xindex>
-// CHECK-VEC1: %[[x:.*]] = memref.load %{{.*}}[%[[i1]], %[[j]]] : memref<33x64xf64>
-// CHECK-VEC1: %[[a:.*]] = memref.load %{{.*}}[%[[jj]]] : memref<?xf64>
-// CHECK-VEC1: %[[s:.*]] = arith.addf %[[x]], %[[a]] : f64
-// CHECK-VEC1: memref.store %[[s]], %{{.*}}[%[[i1]], %[[j]]] : memref<33x64xf64>
-// CHECK-VEC1: }
-// CHECK-VEC1: }
-// CHECK-VEC1: return
-//
-// CHECK-VEC2: #[[$map:.*]] = affine_map<(d0, d1)[s0] -> (16, d0 - d1)
-// CHECK-VEC2-LABEL: func @add_dense
-// CHECK-VEC2-DAG: %[[c0:.*]] = arith.constant 0 : index
-// CHECK-VEC2-DAG: %[[c1:.*]] = arith.constant 1 : index
-// CHECK-VEC2-DAG: %[[c16:.*]] = arith.constant 16 : index
-// CHECK-VEC2-DAG: %[[c32:.*]] = arith.constant 32 : index
-// CHECK-VEC2: scf.for %[[i:.*]] = %[[c0]] to %[[c32]] step %[[c1]] {
-// CHECK-VEC2: %[[lo:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xindex>
-// CHECK-VEC2: %[[i1:.*]] = arith.addi %[[i]], %[[c1]] : index
-// CHECK-VEC2: %[[hi:.*]] = memref.load %{{.*}}[%[[i1]]] : memref<?xindex>
-// CHECK-VEC2: scf.for %[[jj:.*]] = %[[lo]] to %[[hi]] step %[[c16]] {
-// CHECK-VEC2: %[[sub:.*]] = affine.min #[[$map]](%[[hi]], %[[jj]])[%[[c16]]]
-// CHECK-VEC2: %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1>
-// CHECK-VEC2: %[[j:.*]] = vector.maskedload %{{.*}}[%[[jj]]], %[[mask]], %{{.*}} : memref<?xindex>
-// CHECK-VEC2: %[[x:.*]] = vector.gather %{{.*}}[%[[i1]], %[[c0]]] [%[[j]]], %[[mask]], %{{.*}} : memref<33x64xf64>
-// CHECK-VEC2: %[[a:.*]] = vector.maskedload %{{.*}}[%[[jj]]], %[[mask]], %{{.*}} : memref<?xf64>
-// CHECK-VEC2: %[[s:.*]] = arith.addf %[[x]], %[[a]] : vector<16xf64>
-// CHECK-VEC2: vector.scatter %{{.*}}[%[[i1]], %[[c0]]] [%[[j]]], %[[mask]], %[[s]] : memref<33x64xf64>
-// CHECK-VEC2: }
-// CHECK-VEC2: }
-// CHECK-VEC2: return
-//
-// CHECK-VEC4: #[[$map:.*]] = affine_map<(d0, d1)[s0] -> (s0, d0 - d1)
-// CHECK-VEC4-LABEL: func @add_dense
-// CHECK-VEC4-DAG: %[[c0:.*]] = arith.constant 0 : index
-// CHECK-VEC4-DAG: %[[c1:.*]] = arith.constant 1 : index
-// CHECK-VEC4-DAG: %[[c4:.*]] = arith.constant 4 : index
-// CHECK-VEC4-DAG: %[[c32:.*]] = arith.constant 32 : index
-// CHECK-VEC4-DAG: %[[v0idx:.*]] = arith.constant dense<0> : vector<[4]xindex>
-// CHECK-VEC4-DAG: %[[v0f64:.*]] = arith.constant dense<0.000000e+00> : vector<[4]xf64>
-// CHECK-VEC4: scf.for %[[i:.*]] = %[[c0]] to %[[c32]] step %[[c1]] {
-// CHECK-VEC4: %[[lo:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xindex>
-// CHECK-VEC4: %[[i1:.*]] = arith.addi %[[i]], %[[c1]] : index
-// CHECK-VEC4: %[[hi:.*]] = memref.load %{{.*}}[%[[i1]]] : memref<?xindex>
-// CHECK-VEC4: %[[vscale:.*]] = vector.vscale
-// CHECK-VEC4: %[[step:.*]] = arith.muli %[[vscale]], %[[c4]] : index
-// CHECK-VEC4: scf.for %[[jj:.*]] = %[[lo]] to %[[hi]] step %[[step]] {
-// CHECK-VEC4: %[[sub:.*]] = affine.min #[[$map]](%[[hi]], %[[jj]])[%[[step]]]
-// CHECK-VEC4: %[[mask:.*]] = vector.create_mask %[[sub]] : vector<[4]xi1>
-// CHECK-VEC4: %[[j:.*]] = vector.maskedload %{{.*}}[%[[jj]]], %[[mask]], %[[v0idx]] : memref<?xindex>
-// CHECK-VEC4: %[[x:.*]] = vector.gather %{{.*}}[%[[i1]], %[[c0]]] [%[[j]]], %[[mask]], %[[v0f64]] : memref<33x64xf64>
-// CHECK-VEC4: %[[a:.*]] = vector.maskedload %{{.*}}[%[[jj]]], %[[mask]], %[[v0f64]] : memref<?xf64>
-// CHECK-VEC4: %[[s:.*]] = arith.addf %[[x]], %[[a]] : vector<[4]xf64>
-// CHECK-VEC4: vector.scatter %{{.*}}[%[[i1]], %[[c0]]] [%[[j]]], %[[mask]], %[[s]] : memref<33x64xf64>
-// CHECK-VEC4: }
-// CHECK-VEC4: }
-// CHECK-VEC4: return
+// CHECK-LABEL: func @add_dense
+// CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[c32:.*]] = arith.constant 32 : index
+// CHECK: scf.for %[[i:.*]] = %[[c0]] to %[[c32]] step %[[c1]] {
+// CHECK: %[[lo:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xindex>
+// CHECK: %[[i1:.*]] = arith.addi %[[i]], %[[c1]] : index
+// CHECK: %[[hi:.*]] = memref.load %{{.*}}[%[[i1]]] : memref<?xindex>
+// CHECK: scf.for %[[jj:.*]] = %[[lo]] to %[[hi]] step %[[c1]] {
+// CHECK: %[[j:.*]] = memref.load %{{.*}}[%[[jj]]] : memref<?xindex>
+// CHECK: %[[x:.*]] = memref.load %{{.*}}[%[[i1]], %[[j]]] : memref<33x64xf64>
+// CHECK: %[[a:.*]] = memref.load %{{.*}}[%[[jj]]] : memref<?xf64>
+// CHECK: %[[s:.*]] = arith.addf %[[x]], %[[a]] : f64
+// CHECK: memref.store %[[s]], %{{.*}}[%[[i1]], %[[j]]] : memref<33x64xf64>
+// CHECK: }
+// CHECK: }
+// CHECK: return
//
func.func @add_dense(%arga: tensor<32x64xf64, #SparseMatrix>,
%argx: tensor<33x64xf64>) -> tensor<33x64xf64> {
+++ /dev/null
-// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
-
-// RUN: mlir-opt %s -sparsification="vectorization-strategy=any-storage-inner-loop vl=8" -canonicalize | \
-// RUN: FileCheck %s
-
-#SparseMatrix = #sparse_tensor.encoding<{dimLevelType = ["dense","compressed"]}>
-
-#trait = {
- indexing_maps = [
- affine_map<(i,j) -> (i,j)>, // a (in)
- affine_map<(i,j) -> (i,j)>, // b (in)
- affine_map<(i,j) -> ()> // x (out)
- ],
- iterator_types = ["reduction", "reduction"]
-}
-
-// Verifies that the SIMD reductions in the two for-loops after the
-// while-loop are chained before horizontally reducing these back to scalar.
-//
-// CHECK-LABEL: func @sparse_matrix_sum(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<f64>,
-// CHECK-SAME: %[[VAL_1:.*]]: tensor<64x32xf64, #sparse_tensor.encoding<{{{.*}}}>>,
-// CHECK-SAME: %[[VAL_2:.*]]: tensor<64x32xf64, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<f64> {
-// CHECK-DAG: %[[VAL_3:.*]] = arith.constant dense<0.000000e+00> : vector<8xf64>
-// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 8 : index
-// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 64 : index
-// CHECK-DAG: %[[VAL_8:.*]] = arith.constant 1 : index
-// CHECK: %[[VAL_9:.*]] = sparse_tensor.pointers %[[VAL_1]] {dimension = 1 : index} : tensor<64x32xf64, #sparse_tensor.encoding<{{{.*}}}>>
-// CHECK: %[[VAL_10:.*]] = sparse_tensor.indices %[[VAL_1]] {dimension = 1 : index} : tensor<64x32xf64, #sparse_tensor.encoding<{{{.*}}}>>
-// CHECK: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<64x32xf64, #sparse_tensor.encoding<{{{.*}}}>>
-// CHECK: %[[VAL_12:.*]] = sparse_tensor.pointers %[[VAL_2]] {dimension = 1 : index} : tensor<64x32xf64, #sparse_tensor.encoding<{{{.*}}}>>
-// CHECK: %[[VAL_13:.*]] = sparse_tensor.indices %[[VAL_2]] {dimension = 1 : index} : tensor<64x32xf64, #sparse_tensor.encoding<{{{.*}}}>>
-// CHECK: %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_2]] : tensor<64x32xf64, #sparse_tensor.encoding<{{{.*}}}>>
-// CHECK: %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_0]] : memref<f64>
-// CHECK: %[[VAL_16:.*]] = tensor.extract %[[VAL_0]][] : tensor<f64>
-// CHECK: %[[VAL_17:.*]] = scf.for %[[VAL_18:.*]] = %[[VAL_6]] to %[[VAL_7]] step %[[VAL_8]] iter_args(%[[VAL_19:.*]] = %[[VAL_16]]) -> (f64) {
-// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_18]]] : memref<?xindex>
-// CHECK: %[[VAL_21:.*]] = arith.addi %[[VAL_18]], %[[VAL_8]] : index
-// CHECK: %[[VAL_22:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_21]]] : memref<?xindex>
-// CHECK: %[[VAL_23:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_18]]] : memref<?xindex>
-// CHECK: %[[VAL_24:.*]] = arith.addi %[[VAL_18]], %[[VAL_8]] : index
-// CHECK: %[[VAL_25:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_24]]] : memref<?xindex>
-// CHECK: %[[VAL_26:.*]]:3 = scf.while (%[[VAL_27:.*]] = %[[VAL_20]], %[[VAL_28:.*]] = %[[VAL_23]], %[[VAL_29:.*]] = %[[VAL_19]]) : (index, index, f64) -> (index, index, f64) {
-// CHECK: %[[VAL_30:.*]] = arith.cmpi ult, %[[VAL_27]], %[[VAL_22]] : index
-// CHECK: %[[VAL_31:.*]] = arith.cmpi ult, %[[VAL_28]], %[[VAL_25]] : index
-// CHECK: %[[VAL_32:.*]] = arith.andi %[[VAL_30]], %[[VAL_31]] : i1
-// CHECK: scf.condition(%[[VAL_32]]) %[[VAL_27]], %[[VAL_28]], %[[VAL_29]] : index, index, f64
-// CHECK: } do {
-// CHECK: ^bb0(%[[VAL_33:.*]]: index, %[[VAL_34:.*]]: index, %[[VAL_35:.*]]: f64):
-// CHECK: %[[VAL_36:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_33]]] : memref<?xindex>
-// CHECK: %[[VAL_37:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_34]]] : memref<?xindex>
-// CHECK: %[[VAL_38:.*]] = arith.cmpi ult, %[[VAL_37]], %[[VAL_36]] : index
-// CHECK: %[[VAL_39:.*]] = arith.select %[[VAL_38]], %[[VAL_37]], %[[VAL_36]] : index
-// CHECK: %[[VAL_40:.*]] = arith.cmpi eq, %[[VAL_36]], %[[VAL_39]] : index
-// CHECK: %[[VAL_41:.*]] = arith.cmpi eq, %[[VAL_37]], %[[VAL_39]] : index
-// CHECK: %[[VAL_42:.*]] = arith.andi %[[VAL_40]], %[[VAL_41]] : i1
-// CHECK: %[[VAL_43:.*]] = scf.if %[[VAL_42]] -> (f64) {
-// CHECK: %[[VAL_44:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_33]]] : memref<?xf64>
-// CHECK: %[[VAL_45:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_34]]] : memref<?xf64>
-// CHECK: %[[VAL_46:.*]] = arith.addf %[[VAL_44]], %[[VAL_45]] : f64
-// CHECK: %[[VAL_47:.*]] = arith.addf %[[VAL_35]], %[[VAL_46]] : f64
-// CHECK: scf.yield %[[VAL_47]] : f64
-// CHECK: } else {
-// CHECK: %[[VAL_48:.*]] = arith.cmpi eq, %[[VAL_36]], %[[VAL_39]] : index
-// CHECK: %[[VAL_49:.*]] = scf.if %[[VAL_48]] -> (f64) {
-// CHECK: %[[VAL_50:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_33]]] : memref<?xf64>
-// CHECK: %[[VAL_51:.*]] = arith.addf %[[VAL_35]], %[[VAL_50]] : f64
-// CHECK: scf.yield %[[VAL_51]] : f64
-// CHECK: } else {
-// CHECK: %[[VAL_52:.*]] = arith.cmpi eq, %[[VAL_37]], %[[VAL_39]] : index
-// CHECK: %[[VAL_53:.*]] = scf.if %[[VAL_52]] -> (f64) {
-// CHECK: %[[VAL_54:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_34]]] : memref<?xf64>
-// CHECK: %[[VAL_55:.*]] = arith.addf %[[VAL_35]], %[[VAL_54]] : f64
-// CHECK: scf.yield %[[VAL_55]] : f64
-// CHECK: } else {
-// CHECK: scf.yield %[[VAL_35]] : f64
-// CHECK: }
-// CHECK: scf.yield %[[VAL_56:.*]] : f64
-// CHECK: }
-// CHECK: scf.yield %[[VAL_57:.*]] : f64
-// CHECK: }
-// CHECK: %[[VAL_58:.*]] = arith.cmpi eq, %[[VAL_36]], %[[VAL_39]] : index
-// CHECK: %[[VAL_59:.*]] = arith.addi %[[VAL_33]], %[[VAL_8]] : index
-// CHECK: %[[VAL_60:.*]] = arith.select %[[VAL_58]], %[[VAL_59]], %[[VAL_33]] : index
-// CHECK: %[[VAL_61:.*]] = arith.cmpi eq, %[[VAL_37]], %[[VAL_39]] : index
-// CHECK: %[[VAL_62:.*]] = arith.addi %[[VAL_34]], %[[VAL_8]] : index
-// CHECK: %[[VAL_63:.*]] = arith.select %[[VAL_61]], %[[VAL_62]], %[[VAL_34]] : index
-// CHECK: scf.yield %[[VAL_60]], %[[VAL_63]], %[[VAL_64:.*]] : index, index, f64
-// CHECK: }
-// CHECK: %[[VAL_65:.*]] = vector.insertelement %[[VAL_66:.*]]#2, %[[VAL_3]]{{\[}}%[[VAL_6]] : index] : vector<8xf64>
-// CHECK: %[[VAL_67:.*]] = scf.for %[[VAL_68:.*]] = %[[VAL_66]]#0 to %[[VAL_22]] step %[[VAL_4]] iter_args(%[[VAL_69:.*]] = %[[VAL_65]]) -> (vector<8xf64>) {
-// CHECK: %[[VAL_70:.*]] = affine.min #map(%[[VAL_22]], %[[VAL_68]])
-// CHECK: %[[VAL_71:.*]] = vector.create_mask %[[VAL_70]] : vector<8xi1>
-// CHECK: %[[VAL_72:.*]] = vector.maskedload %[[VAL_11]]{{\[}}%[[VAL_68]]], %[[VAL_71]], %[[VAL_3]] : memref<?xf64>, vector<8xi1>, vector<8xf64> into vector<8xf64>
-// CHECK: %[[VAL_73:.*]] = arith.addf %[[VAL_69]], %[[VAL_72]] : vector<8xf64>
-// CHECK: %[[VAL_74:.*]] = arith.select %[[VAL_71]], %[[VAL_73]], %[[VAL_69]] : vector<8xi1>, vector<8xf64>
-// CHECK: scf.yield %[[VAL_74]] : vector<8xf64>
-// CHECK: }
-// CHECK: %[[VAL_75:.*]] = scf.for %[[VAL_76:.*]] = %[[VAL_66]]#1 to %[[VAL_25]] step %[[VAL_4]] iter_args(%[[VAL_77:.*]] = %[[VAL_78:.*]]) -> (vector<8xf64>) {
-// CHECK: %[[VAL_79:.*]] = affine.min #map(%[[VAL_25]], %[[VAL_76]])
-// CHECK: %[[VAL_80:.*]] = vector.create_mask %[[VAL_79]] : vector<8xi1>
-// CHECK: %[[VAL_81:.*]] = vector.maskedload %[[VAL_14]]{{\[}}%[[VAL_76]]], %[[VAL_80]], %[[VAL_3]] : memref<?xf64>, vector<8xi1>, vector<8xf64> into vector<8xf64>
-// CHECK: %[[VAL_82:.*]] = arith.addf %[[VAL_77]], %[[VAL_81]] : vector<8xf64>
-// CHECK: %[[VAL_83:.*]] = arith.select %[[VAL_80]], %[[VAL_82]], %[[VAL_77]] : vector<8xi1>, vector<8xf64>
-// CHECK: scf.yield %[[VAL_83]] : vector<8xf64>
-// CHECK: }
-// CHECK: %[[VAL_84:.*]] = vector.reduction <add>, %[[VAL_85:.*]] : vector<8xf64> into f64
-// CHECK: scf.yield %[[VAL_84]] : f64
-// CHECK: }
-// CHECK: memref.store %[[VAL_86:.*]], %[[VAL_15]][] : memref<f64>
-// CHECK: %[[VAL_87:.*]] = bufferization.to_tensor %[[VAL_15]] : memref<f64>
-// CHECK: return %[[VAL_87]] : tensor<f64>
-// CHECK: }
-func.func @sparse_matrix_sum(%argx: tensor<f64>,
- %arga: tensor<64x32xf64, #SparseMatrix>,
- %argb: tensor<64x32xf64, #SparseMatrix>) -> tensor<f64> {
- %0 = linalg.generic #trait
- ins(%arga, %argb: tensor<64x32xf64, #SparseMatrix>,
- tensor<64x32xf64, #SparseMatrix>)
- outs(%argx: tensor<f64>) {
- ^bb(%a: f64, %b: f64, %x: f64):
- %m = arith.addf %a, %b : f64
- %t = arith.addf %x, %m : f64
- linalg.yield %t : f64
- } -> tensor<f64>
- return %0 : tensor<f64>
-}
+++ /dev/null
-// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
-
-// The script is designed to make adding checks to
-// a test case fast, it is *not* designed to be authoritative
-// about what constitutes a good test! The CHECK should be
-// minimized and named to reflect the test intent.
-
-// RUN: mlir-opt %s -sparsification="vectorization-strategy=any-storage-inner-loop vl=8" -canonicalize | \
-// RUN: FileCheck %s
-
-#SparseVector = #sparse_tensor.encoding<{
- dimLevelType = ["compressed"]
-}>
-
-#trait_1d = {
- indexing_maps = [
- affine_map<(i) -> (i)>, // a
- affine_map<(i) -> (i)> // x (out)
- ],
- iterator_types = ["parallel"],
- doc = "X(i) = a(i) op i"
-}
-
-// CHECK-LABEL: func @sparse_index_1d_conj(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<8xi64, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<8xi64> {
-// CHECK-DAG: %[[VAL_1:.*]] = arith.constant dense<0> : vector<8xi64>
-// CHECK-DAG: %[[VAL_2:.*]] = arith.constant dense<0> : vector<8xindex>
-// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 8 : index
-// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index
-// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : i64
-// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 0 : index} : tensor<8xi64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex>
-// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<8xi64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex>
-// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<8xi64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xi64>
-// CHECK-DAG: %[[VAL_10a:.*]] = tensor.empty() : tensor<8xi64>
-// CHECK-DAG: %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_10a]] : memref<8xi64>
-// CHECK-DAG: linalg.fill ins(%[[VAL_5]] : i64) outs(%[[VAL_10]] : memref<8xi64>)
-// CHECK-DAG: %[[VAL_11:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref<?xindex>
-// CHECK-DAG: %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref<?xindex>
-// CHECK: scf.for %[[VAL_13:.*]] = %[[VAL_11]] to %[[VAL_12]] step %[[VAL_3]] {
-// CHECK: %[[VAL_14:.*]] = affine.min #map0(%[[VAL_13]]){{\[}}%[[VAL_12]]]
-// CHECK: %[[VAL_15:.*]] = vector.create_mask %[[VAL_14]] : vector<8xi1>
-// CHECK: %[[VAL_16:.*]] = vector.maskedload %[[VAL_8]]{{\[}}%[[VAL_13]]], %[[VAL_15]], %[[VAL_2]] : memref<?xindex>, vector<8xi1>, vector<8xindex> into vector<8xindex>
-// CHECK: %[[VAL_17:.*]] = vector.maskedload %[[VAL_9]]{{\[}}%[[VAL_13]]], %[[VAL_15]], %[[VAL_1]] : memref<?xi64>, vector<8xi1>, vector<8xi64> into vector<8xi64>
-// CHECK: %[[VAL_18:.*]] = arith.index_cast %[[VAL_16]] : vector<8xindex> to vector<8xi64>
-// CHECK: %[[VAL_19:.*]] = arith.muli %[[VAL_17]], %[[VAL_18]] : vector<8xi64>
-// CHECK: vector.scatter %[[VAL_10]]{{\[}}%[[VAL_6]]] {{\[}}%[[VAL_16]]], %[[VAL_15]], %[[VAL_19]] : memref<8xi64>, vector<8xindex>, vector<8xi1>, vector<8xi64>
-// CHECK: }
-// CHECK: %[[VAL_20:.*]] = bufferization.to_tensor %[[VAL_10]] : memref<8xi64>
-// CHECK: return %[[VAL_20]] : tensor<8xi64>
-// CHECK: }
-func.func @sparse_index_1d_conj(%arga: tensor<8xi64, #SparseVector>) -> tensor<8xi64> {
- %init = tensor.empty() : tensor<8xi64>
- %r = linalg.generic #trait_1d
- ins(%arga: tensor<8xi64, #SparseVector>)
- outs(%init: tensor<8xi64>) {
- ^bb(%a: i64, %x: i64):
- %i = linalg.index 0 : index
- %ii = arith.index_cast %i : index to i64
- %m1 = arith.muli %a, %ii : i64
- linalg.yield %m1 : i64
- } -> tensor<8xi64>
- return %r : tensor<8xi64>
-}
-
-// CHECK-LABEL: func @sparse_index_1d_disj(
-// CHECK-SAME: %[[VAL_0:.*]]: tensor<8xi64, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor<8xi64> {
-// CHECK-DAG: %[[VAL_1:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7]> : vector<8xindex>
-// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 1 : index
-// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : i64
-// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 8 : index
-// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_0]] {dimension = 0 : index} : tensor<8xi64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex>
-// CHECK-DAG: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_0]] {dimension = 0 : index} : tensor<8xi64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex>
-// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<8xi64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xi64>
-// CHECK-DAG: %[[VAL_9a:.*]] = tensor.empty() : tensor<8xi64>
-// CHECK-DAG: %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_9a]] : memref<8xi64>
-// CHECK-DAG: linalg.fill ins(%[[VAL_3]] : i64) outs(%[[VAL_9]] : memref<8xi64>)
-// CHECK-DAG: %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
-// CHECK-DAG: %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_2]]] : memref<?xindex>
-// CHECK: %[[VAL_12:.*]]:2 = scf.while (%[[VAL_13:.*]] = %[[VAL_10]], %[[VAL_14:.*]] = %[[VAL_5]]) : (index, index) -> (index, index) {
-// CHECK: %[[VAL_15:.*]] = arith.cmpi ult, %[[VAL_13]], %[[VAL_11]] : index
-// CHECK: scf.condition(%[[VAL_15]]) %[[VAL_13]], %[[VAL_14]] : index, index
-// CHECK: } do {
-// CHECK: ^bb0(%[[VAL_16:.*]]: index, %[[VAL_17:.*]]: index):
-// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_16]]] : memref<?xindex>
-// CHECK: %[[VAL_19:.*]] = arith.cmpi eq, %[[VAL_18]], %[[VAL_17]] : index
-// CHECK: scf.if %[[VAL_19]] {
-// CHECK: %[[VAL_20:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_16]]] : memref<?xi64>
-// CHECK: %[[VAL_21:.*]] = arith.index_cast %[[VAL_17]] : index to i64
-// CHECK: %[[VAL_22:.*]] = arith.addi %[[VAL_20]], %[[VAL_21]] : i64
-// CHECK: memref.store %[[VAL_22]], %[[VAL_9]]{{\[}}%[[VAL_17]]] : memref<8xi64>
-// CHECK: } else {
-// CHECK: %[[VAL_23:.*]] = arith.index_cast %[[VAL_17]] : index to i64
-// CHECK: memref.store %[[VAL_23]], %[[VAL_9]]{{\[}}%[[VAL_17]]] : memref<8xi64>
-// CHECK: }
-// CHECK: %[[VAL_24:.*]] = arith.cmpi eq, %[[VAL_18]], %[[VAL_17]] : index
-// CHECK: %[[VAL_25:.*]] = arith.addi %[[VAL_16]], %[[VAL_2]] : index
-// CHECK: %[[VAL_26:.*]] = arith.select %[[VAL_24]], %[[VAL_25]], %[[VAL_16]] : index
-// CHECK: %[[VAL_27:.*]] = arith.addi %[[VAL_17]], %[[VAL_2]] : index
-// CHECK: scf.yield %[[VAL_26]], %[[VAL_27]] : index, index
-// CHECK: }
-// CHECK: scf.for %[[VAL_28:.*]] = %[[VAL_29:.*]]#1 to %[[VAL_4]] step %[[VAL_4]] {
-// CHECK: %[[VAL_30:.*]] = affine.min #map1(%[[VAL_28]])
-// CHECK: %[[VAL_31:.*]] = vector.create_mask %[[VAL_30]] : vector<8xi1>
-// CHECK: %[[VAL_32:.*]] = vector.broadcast %[[VAL_28]] : index to vector<8xindex>
-// CHECK: %[[VAL_33:.*]] = arith.addi %[[VAL_32]], %[[VAL_1]] : vector<8xindex>
-// CHECK: %[[VAL_34:.*]] = arith.index_cast %[[VAL_33]] : vector<8xindex> to vector<8xi64>
-// CHECK: vector.maskedstore %[[VAL_9]]{{\[}}%[[VAL_28]]], %[[VAL_31]], %[[VAL_34]] : memref<8xi64>, vector<8xi1>, vector<8xi64>
-// CHECK: }
-// CHECK: %[[VAL_35:.*]] = bufferization.to_tensor %[[VAL_9]] : memref<8xi64>
-// CHECK: return %[[VAL_35]] : tensor<8xi64>
-// CHECK: }
-func.func @sparse_index_1d_disj(%arga: tensor<8xi64, #SparseVector>) -> tensor<8xi64> {
- %init = tensor.empty() : tensor<8xi64>
- %r = linalg.generic #trait_1d
- ins(%arga: tensor<8xi64, #SparseVector>)
- outs(%init: tensor<8xi64>) {
- ^bb(%a: i64, %x: i64):
- %i = linalg.index 0 : index
- %ii = arith.index_cast %i : index to i64
- %m1 = arith.addi %a, %ii : i64
- linalg.yield %m1 : i64
- } -> tensor<8xi64>
- return %r : tensor<8xi64>
-}
+++ /dev/null
-// RUN: mlir-opt %s -sparsification="vectorization-strategy=any-storage-inner-loop vl=16" -scf-for-loop-peeling -canonicalize | \
-// RUN: FileCheck %s
-
-#SparseVector = #sparse_tensor.encoding<{
- dimLevelType = [ "compressed" ],
- pointerBitWidth = 32,
- indexBitWidth = 32
-}>
-
-#trait_mul_s = {
- indexing_maps = [
- affine_map<(i) -> (i)>, // a
- affine_map<(i) -> (i)>, // b
- affine_map<(i) -> (i)> // x (out)
- ],
- iterator_types = ["parallel"],
- doc = "x(i) = a(i) * b(i)"
-}
-
-// CHECK-DAG: #[[$map0:.*]] = affine_map<()[s0, s1] -> (s0 + ((-s0 + s1) floordiv 16) * 16)>
-// CHECK-DAG: #[[$map1:.*]] = affine_map<(d0)[s0] -> (-d0 + s0)>
-// CHECK-LABEL: func @mul_s
-// CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
-// CHECK-DAG: %[[c16:.*]] = arith.constant 16 : index
-// CHECK: %[[p:.*]] = memref.load %{{.*}}[%[[c0]]] : memref<?xi32>
-// CHECK: %[[a:.*]] = arith.extui %[[p]] : i32 to i64
-// CHECK: %[[q:.*]] = arith.index_cast %[[a]] : i64 to index
-// CHECK: %[[r:.*]] = memref.load %{{.*}}[%[[c1]]] : memref<?xi32>
-// CHECK: %[[b:.*]] = arith.extui %[[r]] : i32 to i64
-// CHECK: %[[s:.*]] = arith.index_cast %[[b]] : i64 to index
-// CHECK: %[[boundary:.*]] = affine.apply #[[$map0]]()[%[[q]], %[[s]]]
-// CHECK: scf.for %[[i:.*]] = %[[q]] to %[[boundary]] step %[[c16]] {
-// CHECK: %[[mask:.*]] = vector.constant_mask [16] : vector<16xi1>
-// CHECK: %[[li:.*]] = vector.load %{{.*}}[%[[i]]] : memref<?xi32>, vector<16xi32>
-// CHECK: %[[zi:.*]] = arith.extui %[[li]] : vector<16xi32> to vector<16xi64>
-// CHECK: %[[la:.*]] = vector.load %{{.*}}[%[[i]]] : memref<?xf32>, vector<16xf32>
-// CHECK: %[[lb:.*]] = vector.gather %{{.*}}[%[[c0]]] [%[[zi]]], %[[mask]], %{{.*}} : memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32> into vector<16xf32>
-// CHECK: %[[m:.*]] = arith.mulf %[[la]], %[[lb]] : vector<16xf32>
-// CHECK: vector.scatter %{{.*}}[%[[c0]]] [%[[zi]]], %[[mask]], %[[m]] : memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32>
-// CHECK: }
-// CHECK: scf.for %[[i2:.*]] = %[[boundary]] to %[[s]] step %[[c16]] {
-// CHECK: %[[sub:.*]] = affine.apply #[[$map1]](%[[i2]])[%[[s]]]
-// CHECK: %[[mask2:.*]] = vector.create_mask %[[sub]] : vector<16xi1>
-// CHECK: %[[li2:.*]] = vector.maskedload %{{.*}}[%[[i2]]], %[[mask2]], %{{.*}} : memref<?xi32>, vector<16xi1>, vector<16xi32> into vector<16xi32>
-// CHECK: %[[zi2:.*]] = arith.extui %[[li2]] : vector<16xi32> to vector<16xi64>
-// CHECK: %[[la2:.*]] = vector.maskedload %{{.*}}[%[[i2]]], %[[mask2]], %{{.*}} : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
-// CHECK: %[[lb2:.*]] = vector.gather %{{.*}}[%[[c0]]] [%[[zi2]]], %[[mask2]], %{{.*}} : memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32> into vector<16xf32>
-// CHECK: %[[m2:.*]] = arith.mulf %[[la2]], %[[lb2]] : vector<16xf32>
-// CHECK: vector.scatter %{{.*}}[%[[c0]]] [%[[zi2]]], %[[mask2]], %[[m2]] : memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32>
-// CHECK: }
-// CHECK: return
-//
-func.func @mul_s(%arga: tensor<1024xf32, #SparseVector>, %argb: tensor<1024xf32>, %argx: tensor<1024xf32>) -> tensor<1024xf32> {
- %0 = linalg.generic #trait_mul_s
- ins(%arga, %argb: tensor<1024xf32, #SparseVector>, tensor<1024xf32>)
- outs(%argx: tensor<1024xf32>) {
- ^bb(%a: f32, %b: f32, %x: f32):
- %0 = arith.mulf %a, %b : f32
- linalg.yield %0 : f32
- } -> tensor<1024xf32>
- return %0 : tensor<1024xf32>
-}
// RUN: -e entry -entry-point-result=void \
// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s
-//
-// Do the same run, but now with SIMDization as well. This should not change the outcome.
-//
-// RUN: mlir-opt %s --sparse-compiler="vectorization-strategy=any-storage-inner-loop vl=2" | \
-// RUN: mlir-cpu-runner \
-// RUN: -e entry -entry-point-result=void \
-// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
-// RUN: FileCheck %s
#SV = #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>
// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s
-//
-// Do the same run, but now with SIMDization as well. This should not change the outcome.
-//
-// RUN: mlir-opt %s --sparse-compiler="vectorization-strategy=any-storage-inner-loop vl=2" | \
-// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
-// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
-// RUN: FileCheck %s
#DCSR = #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>
// RUN: -e entry -entry-point-result=void \
// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s
-//
-// Do the same run, but now with SIMDization as well. This should not change the outcome.
-//
-// RUN: mlir-opt %s --sparse-compiler="vectorization-strategy=any-storage-inner-loop vl=4" | \
-// RUN: TENSOR0="%mlir_src_dir/test/Integration/data/test.tns" \
-// RUN: mlir-cpu-runner \
-// RUN: -e entry -entry-point-result=void \
-// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
-// RUN: FileCheck %s
!Filename = !llvm.ptr<i8>
// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s
-//
-// Do the same run, but now with SIMDization as well. This should not change the outcome.
-//
-// RUN: mlir-opt %s --sparse-compiler="vectorization-strategy=any-storage-inner-loop vl=4" | \
-// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
-// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
-// RUN: FileCheck %s
#SparseVector = #sparse_tensor.encoding<{
dimLevelType = ["compressed"]
// RUN: -e entry -entry-point-result=void \
// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s
-//
-// Do the same run, but now with SIMDization as well. This should not change the outcome.
-//
-// RUN: mlir-opt %s \
-// RUN: --sparse-compiler="vectorization-strategy=any-storage-inner-loop vl=16 enable-simd-index32" | \
-// RUN: TENSOR0="%mlir_src_dir/test/Integration/data/wide.mtx" \
-// RUN: mlir-cpu-runner \
-// RUN: -e entry -entry-point-result=void \
-// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
-// RUN: FileCheck %s
!Filename = !llvm.ptr<i8>
// RUN: -e entry -entry-point-result=void \
// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s
-//
-// Do the same run, but now with SIMDization as well. This should not change the outcome.
-//
-// RUN: mlir-opt %s --sparse-compiler="vectorization-strategy=any-storage-inner-loop vl=4" | \
-// RUN: TENSOR0="%mlir_src_dir/test/Integration/data/mttkrp_b.tns" \
-// RUN: mlir-cpu-runner \
-// RUN: -e entry -entry-point-result=void \
-// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
-// RUN: FileCheck %s
!Filename = !llvm.ptr<i8>
// RUN: -e entry -entry-point-result=void \
// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s
-//
-// Do the same run, but now with SIMDization as well. This should not change the outcome.
-//
-// RUN: mlir-opt %s --sparse-compiler="vectorization-strategy=any-storage-inner-loop vl=4" | \
-// RUN: TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \
-// RUN: mlir-cpu-runner \
-// RUN: -e entry -entry-point-result=void \
-// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
-// RUN: FileCheck %s
!Filename = !llvm.ptr<i8>
// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s
-//
-// Do the same run, but now with SIMDization as well. This should not change the outcome.
-//
-// RUN: mlir-opt %s --sparse-compiler="vectorization-strategy=any-storage-inner-loop vl=2" | \
-// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
-// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
-// RUN: FileCheck %s
#DCSR = #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>
// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s
-//
-// Do the same run, but now with SIMDization as well. This should not change the outcome.
-//
-// RUN: mlir-opt %s -sparse-compiler="vectorization-strategy=any-storage-inner-loop vl=8" | \
-// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
-// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
-// RUN: FileCheck %s
#SV = #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>
#DV = #sparse_tensor.encoding<{ dimLevelType = [ "dense" ] }>
// RUN: -e entry -entry-point-result=void \
// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s
-//
-// Do the same run, but now with SIMDization as well. This should not change the outcome.
-//
-// RUN: mlir-opt %s \
-// RUN: --sparse-compiler="vectorization-strategy=any-storage-inner-loop vl=4 enable-simd-index32" | \
-// RUN: TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \
-// RUN: mlir-cpu-runner \
-// RUN: -e entry -entry-point-result=void \
-// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
-// RUN: FileCheck %s
-//
!Filename = !llvm.ptr<i8>
// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s
-//
-// Do the same run, but now with SIMDization as well. This should not change the outcome.
-//
-// RUN: mlir-opt %s -sparse-compiler="vl=8" | \
-// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
-// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
-// RUN: FileCheck %s
#SM = #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ] }>
// RUN: -e entry -entry-point-result=void \
// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s
-//
-// Do the same run, but now with SIMDization as well. This should not change the outcome.
-//
-// RUN: mlir-opt %s --sparse-compiler="vectorization-strategy=any-storage-inner-loop vl=4" | \
-// RUN: mlir-cpu-runner \
-// RUN: -e entry -entry-point-result=void \
-// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
-// RUN: FileCheck %s
#CSR = #sparse_tensor.encoding<{ dimLevelType = [ "dense", "compressed" ] }>
// RUN: -e entry -entry-point-result=void \
// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s
-//
-// Do the same run, but now with SIMDization as well. This should not change the outcome.
-//
-// RUN: mlir-opt %s --sparse-compiler="vectorization-strategy=any-storage-inner-loop vl=2" | \
-// RUN: TENSOR0="%mlir_src_dir/test/Integration/data/wide.mtx" \
-// RUN: mlir-cpu-runner \
-// RUN: -e entry -entry-point-result=void \
-// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
-// RUN: FileCheck %s
!Filename = !llvm.ptr<i8>
// RUN: -e entry -entry-point-result=void \
// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
// RUN: FileCheck %s
-//
-// Do the same run, but now with SIMDization as well. This should not change the outcome.
-//
-// RUN: mlir-opt %s --sparse-compiler="vectorization-strategy=any-storage-inner-loop vl=2" | \
-// RUN: TENSOR0="%mlir_src_dir/test/Integration/data/test_symmetric.mtx" \
-// RUN: mlir-cpu-runner \
-// RUN: -e entry -entry-point-result=void \
-// RUN: -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
-// RUN: FileCheck %s
!Filename = !llvm.ptr<i8>
ir.AffineMap.get_permutation([0, 1]),
ir.AffineMap.get_permutation([1, 0])
]
- vec_strategy = ['none', 'dense-inner-loop']
for level in levels:
for ordering in orderings:
for pwidth in [32]:
for iwidth in [32]:
- for vec in vec_strategy:
- for e in [True]:
- vl = 1 if vec == 0 else 16
- attr = st.EncodingAttr.get(level, ordering, None, pwidth,
- iwidth)
- opt = (f'parallelization-strategy=none '
- f'vectorization-strategy={vec} '
- f'vl={vl} enable-simd-index32={e}')
- compiler = sparse_compiler.SparseCompiler(
- options=opt, opt_level=0, shared_libs=[support_lib])
- build_compile_and_run_SDDMMM(attr, compiler)
- count = count + 1
- # CHECK: Passed 16 tests
+ for e in [True]:
+ attr = st.EncodingAttr.get(level, ordering, None, pwidth,
+ iwidth)
+ opt = 'parallelization-strategy=none'
+ compiler = sparse_compiler.SparseCompiler(
+ options=opt, opt_level=0, shared_libs=[support_lib])
+ build_compile_and_run_SDDMMM(attr, compiler)
+ count = count + 1
+ # CHECK: Passed 8 tests
print('Passed ', count, 'tests')
vl = 1
e = False
- opt = (f'parallelization-strategy=none '
- f'vectorization-strategy=none '
- f'vl={vl} enable-simd-index32={e}')
+ opt = 'parallelization-strategy=none'
levels = [[st.DimLevelType.dense, st.DimLevelType.dense],
[st.DimLevelType.dense, st.DimLevelType.compressed],
[st.DimLevelType.compressed, st.DimLevelType.dense],
# CHECK-LABEL: TEST: test_stress
print("\nTEST: test_stress")
with ir.Context() as ctx, ir.Location.unknown():
- vl = 1
- e = False
# Disable direct sparse2sparse conversion, because it doubles the time!
# TODO: While direct s2s is far too slow for per-commit testing,
# we should have some framework ensure that we run this test with
s2s = 1
sparsification_options = (
f'parallelization-strategy=none '
- f'vectorization-strategy=none '
- f'vl={vl} '
- f'enable-simd-index32={e} '
f's2s-strategy={s2s}')
compiler = sparse_compiler.SparseCompiler(
options=sparsification_options, opt_level=0, shared_libs=[support_lib])
":Support",
":TensorDialect",
":Transforms",
- ":VectorDialect",
"//llvm:Support",
],
)