name = "sparsePointers64";
else if (eltType.isInteger(32))
name = "sparsePointers32";
+ else if (eltType.isInteger(16))
+ name = "sparsePointers16";
+ else if (eltType.isInteger(8))
+ name = "sparsePointers8";
else
return failure();
rewriter.replaceOpWithNewOp<CallOp>(
name = "sparseIndices64";
else if (eltType.isInteger(32))
name = "sparseIndices32";
+ else if (eltType.isInteger(16))
+ name = "sparseIndices16";
+ else if (eltType.isInteger(8))
+ name = "sparseIndices8";
else
return failure();
rewriter.replaceOpWithNewOp<CallOp>(
}
}
+/// Constructs vector type.
+static VectorType vectorType(CodeGen &codegen, Type etp) {
+ return VectorType::get(codegen.curVecLength, etp);
+}
+
/// Constructs vector type from pointer.
static VectorType vectorType(CodeGen &codegen, Value ptr) {
- Type etp = ptr.getType().cast<MemRefType>().getElementType();
- return VectorType::get(codegen.curVecLength, etp);
+ return vectorType(codegen, ptr.getType().cast<MemRefType>().getElementType());
}
/// Constructs vector iteration mask.
static Value genVectorMask(CodeGen &codegen, PatternRewriter &rewriter,
Value iv, Value lo, Value hi, Value step) {
Location loc = iv.getLoc();
- VectorType mtp =
- VectorType::get(codegen.curVecLength, rewriter.getIntegerType(1));
+ VectorType mtp = vectorType(codegen, rewriter.getIntegerType(1));
// Special case if the vector length evenly divides the trip count (for
// example, "for i = 0, 128, 16"). A constant all-true mask is generated
// so that all subsequent masked memory operations are immediately folded
/// optimizations to hoist the invariant broadcast out of the vector loop.
static Value genVectorInvariantValue(CodeGen &codegen,
PatternRewriter &rewriter, Value val) {
- VectorType vtp = VectorType::get(codegen.curVecLength, val.getType());
+ VectorType vtp = vectorType(codegen, val.getType());
return rewriter.create<vector::BroadcastOp>(val.getLoc(), vtp, val);
}
rewriter.create<memref::StoreOp>(loc, rhs, ptr, args);
}
-/// Generates a pointer/index load from the sparse storage scheme.
+/// Generates a pointer/index load from the sparse storage scheme. Narrower
+/// data types need to be zero extended before casting the value into the
+/// index type used for looping and indexing.
static Value genLoad(CodeGen &codegen, PatternRewriter &rewriter, Location loc,
Value ptr, Value s) {
- if (codegen.curVecLength > 1)
- return genVectorLoad(codegen, rewriter, ptr, {s});
+ // See https://llvm.org/docs/GetElementPtr.html for some background on
+ // the complications described below.
+ if (codegen.curVecLength > 1) {
+ // Since the index vector is used in subsequent gather/scatter operations,
+ // which effectively define an unsigned pointer + signed index, we must
+ // zero extend the vector to an index width. For 8-bit and 16-bit values,
+ // a 32-bit index width suffices. For 32-bit values, zero extending the
+ // elements into 64-bit loses some performance since the 32-bit indexed
+ // gather/scatter is more efficient than the 64-bit index variant (in
+ // the future, we could introduce a flag that states the negative space
+ // of 32-bit indices is unused). For 64-bit values, there is no good way
+ // to state that the indices are unsigned, which creates the potential
+ // for incorrect address calculations in the unlikely case we need such
+ // extremely large offsets.
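+ // For example, a masked 32-bit index load feeding a gather is emitted as
+ // (cf. the CHECK-VEC2 patterns in the vectorization tests; names here are
+ // illustrative only):
+ //   %li = vector.maskedload ... into vector<16xi32>
+ //   %zi = zexti %li : vector<16xi32> to vector<16xi64>
+ //   %lb = vector.gather ... [%zi], ... into vector<16xf32>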
+ Type etp = ptr.getType().cast<MemRefType>().getElementType();
+ Value vload = genVectorLoad(codegen, rewriter, ptr, {s});
+ if (etp.getIntOrFloatBitWidth() < 32)
+ vload = rewriter.create<ZeroExtendIOp>(
+ loc, vload, vectorType(codegen, rewriter.getIntegerType(32)));
+ else if (etp.getIntOrFloatBitWidth() < 64)
+ vload = rewriter.create<ZeroExtendIOp>(
+ loc, vload, vectorType(codegen, rewriter.getIntegerType(64)));
+ return vload;
+ }
+ // For the scalar case, we simply zero extend narrower indices into 64-bit
+ // values before casting to index, without a performance penalty. Here too,
+ // however, indices that are already 64-bit cannot, in theory, express the
+ // full unsigned range, as explained above.
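+ // For example, a 16-bit pointer/index load is emitted as (cf. the
+ // CHECK-TYPE4 patterns in the conversion tests; names illustrative only):
+ //   %0 = memref.load %ptr[%s] : memref<?xi16>
+ //   %1 = zexti %0 : i16 to i64
+ //   %2 = index_cast %1 : i64 to index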
Value load = rewriter.create<memref::LoadOp>(loc, ptr, s);
- return load.getType().isa<IndexType>()
- ? load
- : rewriter.create<IndexCastOp>(loc, load, rewriter.getIndexType());
+ if (!load.getType().isa<IndexType>()) {
+ if (load.getType().getIntOrFloatBitWidth() < 64)
+ load = rewriter.create<ZeroExtendIOp>(loc, load,
+ rewriter.getIntegerType(64));
+ load = rewriter.create<IndexCastOp>(loc, load, rewriter.getIndexType());
+ }
+ return load;
}
/// Generates an invariant value.
if (!merger.isSparseTensor(t) && !linkedSparse(op, t)) {
auto map = op.getIndexingMap(t);
unsigned r = map.getNumResults();
- if (r && map.getDimPosition(r - 1) != idx)
- return false;
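+ // The loop index may only appear in the innermost (last) position of
+ // the access pattern of a dense tensor; reject any other use.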
+ for (unsigned i = 0; i < r; i++) {
+ if (map.getDimPosition(i) == idx && i != r - 1)
+ return false;
+ }
}
}
return true;
class SparseTensorStorageBase {
public:
virtual uint64_t getDimSize(uint64_t) = 0;
+
+ // Overhead storage.
virtual void getPointers(std::vector<uint64_t> **, uint64_t) { fatal("p64"); }
virtual void getPointers(std::vector<uint32_t> **, uint64_t) { fatal("p32"); }
+ virtual void getPointers(std::vector<uint16_t> **, uint64_t) { fatal("p16"); }
+ virtual void getPointers(std::vector<uint8_t> **, uint64_t) { fatal("p8"); }
virtual void getIndices(std::vector<uint64_t> **, uint64_t) { fatal("i64"); }
virtual void getIndices(std::vector<uint32_t> **, uint64_t) { fatal("i32"); }
+ virtual void getIndices(std::vector<uint16_t> **, uint64_t) { fatal("i16"); }
+ virtual void getIndices(std::vector<uint8_t> **, uint64_t) { fatal("i8"); }
+
+ // Primary storage.
virtual void getValues(std::vector<double> **) { fatal("valf64"); }
virtual void getValues(std::vector<float> **) { fatal("valf32"); }
+
virtual ~SparseTensorStorageBase() {}
private:
uint64_t strides[1];
};
+struct MemRef1DU16 {
+ const uint16_t *base;
+ const uint16_t *data;
+ uint64_t off;
+ uint64_t sizes[1];
+ uint64_t strides[1];
+};
+
+struct MemRef1DU8 {
+ const uint8_t *base;
+ const uint8_t *data;
+ uint64_t off;
+ uint64_t sizes[1];
+ uint64_t strides[1];
+};
+
struct MemRef1DF64 {
const double *base;
const double *data;
uint64_t strides[1];
};
-enum TypeEnum : uint64_t { kF64 = 0, kF32 = 1, kU64 = 2, kU32 = 3 };
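+// These type codes must match the constant index values used by MLIR callers
+// of newSparseTensor (cf. the integration tests, where e.g. kU8 = 4 and
+// kF32 = 2).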
+enum OverheadTypeEnum : uint64_t { kU64 = 1, kU32 = 2, kU16 = 3, kU8 = 4 };
+enum PrimaryTypeEnum : uint64_t { kF64 = 1, kF32 = 2 };
+
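+// Helper macro that instantiates the templated newSparseTensor<P, I, V>
+// implementation for a given combination of runtime type codes.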
+#define CASE(p, i, v, P, I, V) \
+ if (ptrTp == (p) && indTp == (i) && valTp == (v)) \
+ return newSparseTensor<P, I, V>(filename, sparsity, asize)
void *newSparseTensor(char *filename, bool *abase, bool *adata, uint64_t aoff,
uint64_t asize, uint64_t astride, uint64_t ptrTp,
uint64_t indTp, uint64_t valTp) {
assert(astride == 1);
bool *sparsity = abase + aoff;
- if (ptrTp == kU64 && indTp == kU64 && valTp == kF64)
- return newSparseTensor<uint64_t, uint64_t, double>(filename, sparsity,
- asize);
- if (ptrTp == kU64 && indTp == kU64 && valTp == kF32)
- return newSparseTensor<uint64_t, uint64_t, float>(filename, sparsity,
- asize);
- if (ptrTp == kU64 && indTp == kU32 && valTp == kF64)
- return newSparseTensor<uint64_t, uint32_t, double>(filename, sparsity,
- asize);
- if (ptrTp == kU64 && indTp == kU32 && valTp == kF32)
- return newSparseTensor<uint64_t, uint32_t, float>(filename, sparsity,
- asize);
- if (ptrTp == kU32 && indTp == kU64 && valTp == kF64)
- return newSparseTensor<uint32_t, uint64_t, double>(filename, sparsity,
- asize);
- if (ptrTp == kU32 && indTp == kU64 && valTp == kF32)
- return newSparseTensor<uint32_t, uint64_t, float>(filename, sparsity,
- asize);
- if (ptrTp == kU32 && indTp == kU32 && valTp == kF64)
- return newSparseTensor<uint32_t, uint32_t, double>(filename, sparsity,
- asize);
- if (ptrTp == kU32 && indTp == kU32 && valTp == kF32)
- return newSparseTensor<uint32_t, uint32_t, float>(filename, sparsity,
- asize);
+
+ // The most common cases: 64-bit or 32-bit overhead, double/float values.
+ CASE(kU64, kU64, kF64, uint64_t, uint64_t, double);
+ CASE(kU64, kU64, kF32, uint64_t, uint64_t, float);
+ CASE(kU64, kU32, kF64, uint64_t, uint32_t, double);
+ CASE(kU64, kU32, kF32, uint64_t, uint32_t, float);
+ CASE(kU32, kU64, kF64, uint32_t, uint64_t, double);
+ CASE(kU32, kU64, kF32, uint32_t, uint64_t, float);
+ CASE(kU32, kU32, kF64, uint32_t, uint32_t, double);
+ CASE(kU32, kU32, kF32, uint32_t, uint32_t, float);
+
+ // Some special cases: low overhead storage, double/float values.
+ CASE(kU16, kU16, kF64, uint16_t, uint16_t, double);
+ CASE(kU8, kU8, kF64, uint8_t, uint8_t, double);
+ CASE(kU16, kU16, kF32, uint16_t, uint16_t, float);
+ CASE(kU8, kU8, kF32, uint8_t, uint8_t, float);
+
+ // Unsupported case (add above if needed).
fputs("unsupported combination of types\n", stderr);
exit(1);
}
+#undef CASE
+
uint64_t sparseDimSize(void *tensor, uint64_t d) {
return static_cast<SparseTensorStorageBase *>(tensor)->getDimSize(d);
}
return {v->data(), v->data(), 0, {v->size()}, {1}};
}
+MemRef1DU16 sparsePointers16(void *tensor, uint64_t d) {
+ std::vector<uint16_t> *v;
+ static_cast<SparseTensorStorageBase *>(tensor)->getPointers(&v, d);
+ return {v->data(), v->data(), 0, {v->size()}, {1}};
+}
+
+MemRef1DU8 sparsePointers8(void *tensor, uint64_t d) {
+ std::vector<uint8_t> *v;
+ static_cast<SparseTensorStorageBase *>(tensor)->getPointers(&v, d);
+ return {v->data(), v->data(), 0, {v->size()}, {1}};
+}
+
MemRef1DU64 sparseIndices64(void *tensor, uint64_t d) {
std::vector<uint64_t> *v;
static_cast<SparseTensorStorageBase *>(tensor)->getIndices(&v, d);
return {v->data(), v->data(), 0, {v->size()}, {1}};
}
+MemRef1DU16 sparseIndices16(void *tensor, uint64_t d) {
+ std::vector<uint16_t> *v;
+ static_cast<SparseTensorStorageBase *>(tensor)->getIndices(&v, d);
+ return {v->data(), v->data(), 0, {v->size()}, {1}};
+}
+
+MemRef1DU8 sparseIndices8(void *tensor, uint64_t d) {
+ std::vector<uint8_t> *v;
+ static_cast<SparseTensorStorageBase *>(tensor)->getIndices(&v, d);
+ return {v->data(), v->data(), 0, {v->size()}, {1}};
+}
+
MemRef1DF64 sparseValuesF64(void *tensor) {
std::vector<double> *v;
static_cast<SparseTensorStorageBase *>(tensor)->getValues(&v);
# Copy test data over.
file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/Integration/data/test.mtx
${CMAKE_CURRENT_SOURCE_DIR}/Integration/data/test.tns
+ ${CMAKE_CURRENT_SOURCE_DIR}/Integration/data/wide.mtx
DESTINATION ${MLIR_INTEGRATION_TEST_DIR}/data/)
endif()
// CHECK-TYPE1: %[[B1:.*]] = index_cast %[[P1]] : i64 to index
// CHECK-TYPE1: scf.for %[[I:.*]] = %[[B0]] to %[[B1]] step %[[C1]] {
// CHECK-TYPE1: %[[IND0:.*]] = memref.load %{{.*}}[%[[I]]] : memref<?xi32>
-// CHECK-TYPE1: %[[INDC:.*]] = index_cast %[[IND0]] : i32 to index
+// CHECK-TYPE1: %[[ZEXT:.*]] = zexti %[[IND0]] : i32 to i64
+// CHECK-TYPE1: %[[INDC:.*]] = index_cast %[[ZEXT]] : i64 to index
// CHECK-TYPE1: %[[VAL0:.*]] = memref.load %{{.*}}[%[[I]]] : memref<?xf64>
// CHECK-TYPE1: %[[VAL1:.*]] = memref.load %{{.*}}[%[[INDC]]] : memref<32xf64>
// CHECK-TYPE1: %[[MUL:.*]] = mulf %[[VAL0]], %[[VAL1]] : f64
// CHECK-TYPE2: %[[C0:.*]] = constant 0 : index
// CHECK-TYPE2: %[[C1:.*]] = constant 1 : index
// CHECK-TYPE2: %[[P0:.*]] = memref.load %{{.*}}[%[[C0]]] : memref<?xi32>
-// CHECK-TYPE2: %[[B0:.*]] = index_cast %[[P0]] : i32 to index
+// CHECK-TYPE2: %[[Z0:.*]] = zexti %[[P0]] : i32 to i64
+// CHECK-TYPE2: %[[B0:.*]] = index_cast %[[Z0]] : i64 to index
// CHECK-TYPE2: %[[P1:.*]] = memref.load %{{.*}}[%[[C1]]] : memref<?xi32>
-// CHECK-TYPE2: %[[B1:.*]] = index_cast %[[P1]] : i32 to index
+// CHECK-TYPE2: %[[Z1:.*]] = zexti %[[P1]] : i32 to i64
+// CHECK-TYPE2: %[[B1:.*]] = index_cast %[[Z1]] : i64 to index
// CHECK-TYPE2: scf.for %[[I:.*]] = %[[B0]] to %[[B1]] step %[[C1]] {
// CHECK-TYPE2: %[[IND0:.*]] = memref.load %{{.*}}[%[[I]]] : memref<?xi64>
// CHECK-TYPE2: %[[INDC:.*]] = index_cast %[[IND0]] : i64 to index
// CHECK-TYPE3: %[[C0:.*]] = constant 0 : index
// CHECK-TYPE3: %[[C1:.*]] = constant 1 : index
// CHECK-TYPE3: %[[P0:.*]] = memref.load %{{.*}}[%[[C0]]] : memref<?xi32>
-// CHECK-TYPE3: %[[B0:.*]] = index_cast %[[P0]] : i32 to index
+// CHECK-TYPE3: %[[Z0:.*]] = zexti %[[P0]] : i32 to i64
+// CHECK-TYPE3: %[[B0:.*]] = index_cast %[[Z0]] : i64 to index
// CHECK-TYPE3: %[[P1:.*]] = memref.load %{{.*}}[%[[C1]]] : memref<?xi32>
-// CHECK-TYPE3: %[[B1:.*]] = index_cast %[[P1]] : i32 to index
+// CHECK-TYPE3: %[[Z1:.*]] = zexti %[[P1]] : i32 to i64
+// CHECK-TYPE3: %[[B1:.*]] = index_cast %[[Z1]] : i64 to index
// CHECK-TYPE3: scf.for %[[I:.*]] = %[[B0]] to %[[B1]] step %[[C1]] {
// CHECK-TYPE3: %[[IND0:.*]] = memref.load %{{.*}}[%[[I]]] : memref<?xi32>
-// CHECK-TYPE3: %[[INDC:.*]] = index_cast %[[IND0]] : i32 to index
+// CHECK-TYPE3: %[[ZEXT:.*]] = zexti %[[IND0]] : i32 to i64
+// CHECK-TYPE3: %[[INDC:.*]] = index_cast %[[ZEXT]] : i64 to index
// CHECK-TYPE3: %[[VAL0:.*]] = memref.load %{{.*}}[%[[I]]] : memref<?xf64>
// CHECK-TYPE3: %[[VAL1:.*]] = memref.load %{{.*}}[%[[INDC]]] : memref<32xf64>
// CHECK-TYPE3: %[[MUL:.*]] = mulf %[[VAL0]], %[[VAL1]] : f64
// CHECK-TYPE4: %[[C0:.*]] = constant 0 : index
// CHECK-TYPE4: %[[C1:.*]] = constant 1 : index
// CHECK-TYPE4: %[[P0:.*]] = memref.load %{{.*}}[%[[C0]]] : memref<?xi16>
-// CHECK-TYPE4: %[[B0:.*]] = index_cast %[[P0]] : i16 to index
+// CHECK-TYPE4: %[[Z0:.*]] = zexti %[[P0]] : i16 to i64
+// CHECK-TYPE4: %[[B0:.*]] = index_cast %[[Z0]] : i64 to index
// CHECK-TYPE4: %[[P1:.*]] = memref.load %{{.*}}[%[[C1]]] : memref<?xi16>
-// CHECK-TYPE4: %[[B1:.*]] = index_cast %[[P1]] : i16 to index
+// CHECK-TYPE4: %[[Z1:.*]] = zexti %[[P1]] : i16 to i64
+// CHECK-TYPE4: %[[B1:.*]] = index_cast %[[Z1]] : i64 to index
// CHECK-TYPE4: scf.for %[[I:.*]] = %[[B0]] to %[[B1]] step %[[C1]] {
// CHECK-TYPE4: %[[IND0:.*]] = memref.load %{{.*}}[%[[I]]] : memref<?xi16>
-// CHECK-TYPE4: %[[INDC:.*]] = index_cast %[[IND0]] : i16 to index
+// CHECK-TYPE4: %[[ZEXT:.*]] = zexti %[[IND0]] : i16 to i64
+// CHECK-TYPE4: %[[INDC:.*]] = index_cast %[[ZEXT]] : i64 to index
// CHECK-TYPE4: %[[VAL0:.*]] = memref.load %{{.*}}[%[[I]]] : memref<?xf64>
// CHECK-TYPE4: %[[VAL1:.*]] = memref.load %{{.*}}[%[[INDC]]] : memref<32xf64>
// CHECK-TYPE4: %[[MUL:.*]] = mulf %[[VAL0]], %[[VAL1]] : f64
// CHECK-TYPE5: %[[C0:.*]] = constant 0 : index
// CHECK-TYPE5: %[[C1:.*]] = constant 1 : index
// CHECK-TYPE5: %[[P0:.*]] = memref.load %{{.*}}[%[[C0]]] : memref<?xi8>
-// CHECK-TYPE5: %[[B0:.*]] = index_cast %[[P0]] : i8 to index
+// CHECK-TYPE5: %[[Z0:.*]] = zexti %[[P0]] : i8 to i64
+// CHECK-TYPE5: %[[B0:.*]] = index_cast %[[Z0]] : i64 to index
// CHECK-TYPE5: %[[P1:.*]] = memref.load %{{.*}}[%[[C1]]] : memref<?xi8>
-// CHECK-TYPE5: %[[B1:.*]] = index_cast %[[P1]] : i8 to index
+// CHECK-TYPE5: %[[Z1:.*]] = zexti %[[P1]] : i8 to i64
+// CHECK-TYPE5: %[[B1:.*]] = index_cast %[[Z1]] : i64 to index
// CHECK-TYPE5: scf.for %[[I:.*]] = %[[B0]] to %[[B1]] step %[[C1]] {
// CHECK-TYPE5: %[[IND0:.*]] = memref.load %{{.*}}[%[[I]]] : memref<?xi8>
-// CHECK-TYPE5: %[[INDC:.*]] = index_cast %[[IND0]] : i8 to index
+// CHECK-TYPE5: %[[ZEXT:.*]] = zexti %[[IND0]] : i8 to i64
+// CHECK-TYPE5: %[[INDC:.*]] = index_cast %[[ZEXT]] : i64 to index
// CHECK-TYPE5: %[[VAL0:.*]] = memref.load %{{.*}}[%[[I]]] : memref<?xf64>
// CHECK-TYPE5: %[[VAL1:.*]] = memref.load %{{.*}}[%[[INDC]]] : memref<32xf64>
// CHECK-TYPE5: %[[MUL:.*]] = mulf %[[VAL0]], %[[VAL1]] : f64
// CHECK-VEC0-DAG: %[[c0:.*]] = constant 0 : index
// CHECK-VEC0-DAG: %[[c1:.*]] = constant 1 : index
// CHECK-VEC0: %[[p:.*]] = memref.load %{{.*}}[%[[c0]]] : memref<?xi32>
-// CHECK-VEC0: %[[q:.*]] = index_cast %[[p]] : i32 to index
+// CHECK-VEC0: %[[a:.*]] = zexti %[[p]] : i32 to i64
+// CHECK-VEC0: %[[q:.*]] = index_cast %[[a]] : i64 to index
// CHECK-VEC0: %[[r:.*]] = memref.load %{{.*}}[%[[c1]]] : memref<?xi32>
-// CHECK-VEC0: %[[s:.*]] = index_cast %[[r]] : i32 to index
+// CHECK-VEC0: %[[b:.*]] = zexti %[[r]] : i32 to i64
+// CHECK-VEC0: %[[s:.*]] = index_cast %[[b]] : i64 to index
// CHECK-VEC0: scf.for %[[i:.*]] = %[[q]] to %[[s]] step %[[c1]] {
// CHECK-VEC0: %[[li:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xi32>
-// CHECK-VEC0: %[[ci:.*]] = index_cast %[[li]] : i32 to index
+// CHECK-VEC0: %[[zi:.*]] = zexti %[[li]] : i32 to i64
+// CHECK-VEC0: %[[ci:.*]] = index_cast %[[zi]] : i64 to index
// CHECK-VEC0: %[[la:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xf32>
// CHECK-VEC0: %[[lb:.*]] = memref.load %{{.*}}[%[[ci]]] : memref<1024xf32>
// CHECK-VEC0: %[[m:.*]] = mulf %[[la]], %[[lb]] : f32
// CHECK-VEC1-DAG: %[[c0:.*]] = constant 0 : index
// CHECK-VEC1-DAG: %[[c1:.*]] = constant 1 : index
// CHECK-VEC1: %[[p:.*]] = memref.load %{{.*}}[%[[c0]]] : memref<?xi32>
-// CHECK-VEC1: %[[q:.*]] = index_cast %[[p]] : i32 to index
+// CHECK-VEC1: %[[a:.*]] = zexti %[[p]] : i32 to i64
+// CHECK-VEC1: %[[q:.*]] = index_cast %[[a]] : i64 to index
// CHECK-VEC1: %[[r:.*]] = memref.load %{{.*}}[%[[c1]]] : memref<?xi32>
-// CHECK-VEC1: %[[s:.*]] = index_cast %[[r]] : i32 to index
+// CHECK-VEC1: %[[b:.*]] = zexti %[[r]] : i32 to i64
+// CHECK-VEC1: %[[s:.*]] = index_cast %[[b]] : i64 to index
// CHECK-VEC1: scf.for %[[i:.*]] = %[[q]] to %[[s]] step %[[c1]] {
// CHECK-VEC1: %[[li:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xi32>
-// CHECK-VEC1: %[[ci:.*]] = index_cast %[[li]] : i32 to index
+// CHECK-VEC1: %[[zi:.*]] = zexti %[[li]] : i32 to i64
+// CHECK-VEC1: %[[ci:.*]] = index_cast %[[zi]] : i64 to index
// CHECK-VEC1: %[[la:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xf32>
// CHECK-VEC1: %[[lb:.*]] = memref.load %{{.*}}[%[[ci]]] : memref<1024xf32>
// CHECK-VEC1: %[[m:.*]] = mulf %[[la]], %[[lb]] : f32
// CHECK-VEC2-DAG: %[[c1:.*]] = constant 1 : index
// CHECK-VEC2-DAG: %[[c16:.*]] = constant 16 : index
// CHECK-VEC2: %[[p:.*]] = memref.load %{{.*}}[%[[c0]]] : memref<?xi32>
-// CHECK-VEC2: %[[q:.*]] = index_cast %[[p]] : i32 to index
+// CHECK-VEC2: %[[a:.*]] = zexti %[[p]] : i32 to i64
+// CHECK-VEC2: %[[q:.*]] = index_cast %[[a]] : i64 to index
// CHECK-VEC2: %[[r:.*]] = memref.load %{{.*}}[%[[c1]]] : memref<?xi32>
-// CHECK-VEC2: %[[s:.*]] = index_cast %[[r]] : i32 to index
+// CHECK-VEC2: %[[b:.*]] = zexti %[[r]] : i32 to i64
+// CHECK-VEC2: %[[s:.*]] = index_cast %[[b]] : i64 to index
// CHECK-VEC2: scf.for %[[i:.*]] = %[[q]] to %[[s]] step %[[c16]] {
// CHECK-VEC2: %[[sub:.*]] = subi %[[s]], %[[i]] : index
// CHECK-VEC2: %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1>
// CHECK-VEC2: %[[li:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref<?xi32>, vector<16xi1>, vector<16xi32> into vector<16xi32>
+// CHECK-VEC2: %[[zi:.*]] = zexti %[[li]] : vector<16xi32> to vector<16xi64>
// CHECK-VEC2: %[[la:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
-// CHECK-VEC2: %[[lb:.*]] = vector.gather %{{.*}}[%[[c0]]] [%[[li]]], %[[mask]], %{{.*}} : memref<1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
+// CHECK-VEC2: %[[lb:.*]] = vector.gather %{{.*}}[%[[c0]]] [%[[zi]]], %[[mask]], %{{.*}} : memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32> into vector<16xf32>
// CHECK-VEC2: %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32>
-// CHECK-VEC2: vector.scatter %{{.*}}[%[[c0]]] [%[[li]]], %[[mask]], %[[m]] : memref<1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32>
+// CHECK-VEC2: vector.scatter %{{.*}}[%[[c0]]] [%[[zi]]], %[[mask]], %[[m]] : memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32>
// CHECK-VEC2: }
// CHECK-VEC2: return
//
// CHECK-VEC2-DAG: %[[c1:.*]] = constant 1 : index
// CHECK-VEC2-DAG: %[[c16:.*]] = constant 16 : index
// CHECK-VEC2: %[[p:.*]] = memref.load %{{.*}}[%[[c0]]] : memref<?xi32>
-// CHECK-VEC2: %[[q:.*]] = index_cast %[[p]] : i32 to index
+// CHECK-VEC2: %[[a:.*]] = zexti %[[p]] : i32 to i64
+// CHECK-VEC2: %[[q:.*]] = index_cast %[[a]] : i64 to index
// CHECK-VEC2: %[[r:.*]] = memref.load %{{.*}}[%[[c1]]] : memref<?xi32>
-// CHECK-VEC2: %[[s:.*]] = index_cast %[[r]] : i32 to index
+// CHECK-VEC2: %[[b:.*]] = zexti %[[r]] : i32 to i64
+// CHECK-VEC2: %[[s:.*]] = index_cast %[[b]] : i64 to index
// CHECK-VEC2: scf.for %[[i:.*]] = %[[q]] to %[[s]] step %[[c16]] {
// CHECK-VEC2: %[[sub:.*]] = subi %[[s]], %[[i]] : index
// CHECK-VEC2: %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1>
// CHECK-VEC2: %[[li:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref<?xi32>, vector<16xi1>, vector<16xi32> into vector<16xi32>
+// CHECK-VEC2: %[[zi:.*]] = zexti %[[li]] : vector<16xi32> to vector<16xi64>
// CHECK-VEC2: %[[la:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
-// CHECK-VEC2: %[[lb:.*]] = vector.gather %{{.*}}[%[[c0]]] [%[[li]]], %[[mask]], %{{.*}} : memref<?xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
+// CHECK-VEC2: %[[lb:.*]] = vector.gather %{{.*}}[%[[c0]]] [%[[zi]]], %[[mask]], %{{.*}} : memref<?xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32> into vector<16xf32>
// CHECK-VEC2: %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32>
-// CHECK-VEC2: vector.scatter %{{.*}}[%[[c0]]] [%[[li]]], %[[mask]], %[[m]] : memref<1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32>
+// CHECK-VEC2: vector.scatter %{{.*}}[%[[c0]]] [%[[zi]]], %[[mask]], %[[m]] : memref<1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32>
// CHECK-VEC2: }
// CHECK-VEC2: return
//
// CHECK-VEC0-DAG: %[[c512:.*]] = constant 512 : index
// CHECK-VEC0: scf.for %[[i:.*]] = %[[c0]] to %[[c512]] step %[[c1]] {
// CHECK-VEC0: %[[p:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xi32>
-// CHECK-VEC0: %[[q:.*]] = index_cast %[[p]] : i32 to index
+// CHECK-VEC0: %[[a:.*]] = zexti %[[p]] : i32 to i64
+// CHECK-VEC0: %[[q:.*]] = index_cast %[[a]] : i64 to index
// CHECK-VEC0: %[[a:.*]] = addi %[[i]], %[[c1]] : index
// CHECK-VEC0: %[[r:.*]] = memref.load %{{.*}}[%[[a]]] : memref<?xi32>
-// CHECK-VEC0: %[[s:.*]] = index_cast %[[r]] : i32 to index
+// CHECK-VEC0: %[[b:.*]] = zexti %[[r]] : i32 to i64
+// CHECK-VEC0: %[[s:.*]] = index_cast %[[b]] : i64 to index
// CHECK-VEC0: scf.for %[[j:.*]] = %[[q]] to %[[s]] step %[[c1]] {
// CHECK-VEC0: %[[lj:.*]] = memref.load %{{.*}}[%[[j]]] : memref<?xi32>
-// CHECK-VEC0: %[[cj:.*]] = index_cast %[[lj]] : i32 to index
+// CHECK-VEC0: %[[zj:.*]] = zexti %[[lj]] : i32 to i64
+// CHECK-VEC0: %[[cj:.*]] = index_cast %[[zj]] : i64 to index
// CHECK-VEC0: %[[la:.*]] = memref.load %{{.*}}[%[[j]]] : memref<?xf32>
// CHECK-VEC0: %[[lb:.*]] = memref.load %{{.*}}[%[[i]], %[[cj]]] : memref<512x1024xf32>
// CHECK-VEC0: %[[m:.*]] = mulf %[[la]], %[[lb]] : f32
// CHECK-VEC1-DAG: %[[c512:.*]] = constant 512 : index
// CHECK-VEC1: scf.for %[[i:.*]] = %[[c0]] to %[[c512]] step %[[c1]] {
// CHECK-VEC1: %[[p:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xi32>
-// CHECK-VEC1: %[[q:.*]] = index_cast %[[p]] : i32 to index
+// CHECK-VEC1: %[[a:.*]] = zexti %[[p]] : i32 to i64
+// CHECK-VEC1: %[[q:.*]] = index_cast %[[a]] : i64 to index
// CHECK-VEC1: %[[a:.*]] = addi %[[i]], %[[c1]] : index
// CHECK-VEC1: %[[r:.*]] = memref.load %{{.*}}[%[[a]]] : memref<?xi32>
-// CHECK-VEC1: %[[s:.*]] = index_cast %[[r]] : i32 to index
+// CHECK-VEC1: %[[b:.*]] = zexti %[[r]] : i32 to i64
+// CHECK-VEC1: %[[s:.*]] = index_cast %[[b]] : i64 to index
// CHECK-VEC1: scf.for %[[j:.*]] = %[[q]] to %[[s]] step %[[c1]] {
// CHECK-VEC1: %[[lj:.*]] = memref.load %{{.*}}[%[[j]]] : memref<?xi32>
-// CHECK-VEC1: %[[cj:.*]] = index_cast %[[lj]] : i32 to index
+// CHECK-VEC1: %[[zj:.*]] = zexti %[[lj]] : i32 to i64
+// CHECK-VEC1: %[[cj:.*]] = index_cast %[[zj]] : i64 to index
// CHECK-VEC1: %[[la:.*]] = memref.load %{{.*}}[%[[j]]] : memref<?xf32>
// CHECK-VEC1: %[[lb:.*]] = memref.load %{{.*}}[%[[i]], %[[cj]]] : memref<512x1024xf32>
// CHECK-VEC1: %[[m:.*]] = mulf %[[la]], %[[lb]] : f32
// CHECK-VEC2-DAG: %[[c512:.*]] = constant 512 : index
// CHECK-VEC2: scf.for %[[i:.*]] = %[[c0]] to %[[c512]] step %[[c1]] {
// CHECK-VEC2: %[[p:.*]] = memref.load %{{.*}}[%[[i]]] : memref<?xi32>
-// CHECK-VEC2: %[[q:.*]] = index_cast %[[p]] : i32 to index
+// CHECK-VEC2: %[[a:.*]] = zexti %[[p]] : i32 to i64
+// CHECK-VEC2: %[[q:.*]] = index_cast %[[a]] : i64 to index
// CHECK-VEC2: %[[a:.*]] = addi %[[i]], %[[c1]] : index
// CHECK-VEC2: %[[r:.*]] = memref.load %{{.*}}[%[[a]]] : memref<?xi32>
-// CHECK-VEC2: %[[s:.*]] = index_cast %[[r]] : i32 to index
+// CHECK-VEC2: %[[b:.*]] = zexti %[[r]] : i32 to i64
+// CHECK-VEC2: %[[s:.*]] = index_cast %[[b]] : i64 to index
// CHECK-VEC2: scf.for %[[j:.*]] = %[[q]] to %[[s]] step %[[c16]] {
// CHECK-VEC2: %[[sub:.*]] = subi %[[s]], %[[j]] : index
// CHECK-VEC2: %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1>
// CHECK-VEC2: %[[lj:.*]] = vector.maskedload %{{.*}}[%[[j]]], %[[mask]], %{{.*}} : memref<?xi32>, vector<16xi1>, vector<16xi32> into vector<16xi32>
+// CHECK-VEC2: %[[zj:.*]] = zexti %[[lj]] : vector<16xi32> to vector<16xi64>
// CHECK-VEC2: %[[la:.*]] = vector.maskedload %{{.*}}[%[[j]]], %[[mask]], %{{.*}} : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
-// CHECK-VEC2: %[[lb:.*]] = vector.gather %{{.*}}[%[[i]], %[[c0]]] [%[[lj]]], %[[mask]], %{{.*}} : memref<512x1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
+// CHECK-VEC2: %[[lb:.*]] = vector.gather %{{.*}}[%[[i]], %[[c0]]] [%[[zj]]], %[[mask]], %{{.*}} : memref<512x1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32> into vector<16xf32>
// CHECK-VEC2: %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32>
-// CHECK-VEC2: vector.scatter %{{.*}}[%[[i]], %[[c0]]] [%[[lj]]], %[[mask]], %[[m]] : memref<512x1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32>
+// CHECK-VEC2: vector.scatter %{{.*}}[%[[i]], %[[c0]]] [%[[zj]]], %[[mask]], %[[m]] : memref<512x1024xf32>, vector<16xi64>, vector<16xi1>, vector<16xf32>
// CHECK-VEC2: }
// CHECK-VEC2: }
// CHECK-VEC2: return
func private @getTensorFilename(index) -> (!Filename)
func private @newSparseTensor(!Filename, memref<?xi1>, index, index, index) -> (!SparseTensor)
func private @delSparseTensor(!SparseTensor) -> ()
- func private @print_memref_f32(%ptr : tensor<*xf32>)
//
// Main driver that reads matrix from file and calls the sparse kernel.
%sparse = constant true
memref.store %sparse, %annotations[%c0] : memref<?xi1>
memref.store %sparse, %annotations[%c1] : memref<?xi1>
- %i32 = constant 3 : index
- %f32 = constant 1 : index
+ %i32 = constant 2 : index
+ %f32 = constant 2 : index
// Setup memory for the dense matrices and initialize.
%adata = memref.alloc(%c5, %c10) : memref<?x?xf32>
func private @getTensorFilename(index) -> (!Filename)
func private @newSparseTensor(!Filename, memref<?xi1>, index, index, index) -> (!SparseTensor)
func private @delSparseTensor(!SparseTensor) -> ()
- func private @print_memref_f64(%ptr : tensor<*xf64>)
//
// Main driver that reads matrix from file and calls the sparse kernel.
%sparse = constant true
memref.store %sparse, %annotations[%c0] : memref<?xi1>
memref.store %sparse, %annotations[%c1] : memref<?xi1>
- %i64 = constant 2 : index
- %f64 = constant 0 : index
+ %i64 = constant 1 : index
+ %f64 = constant 1 : index
// Setup memory for a single reduction scalar,
// initialized to zero.
--- /dev/null
+// RUN: mlir-opt %s \
+// RUN: --test-sparsification="lower ptr-type=4 ind-type=4" \
+// RUN: --convert-linalg-to-loops --convert-vector-to-scf --convert-scf-to-std \
+// RUN: --func-bufferize --tensor-constant-bufferize --tensor-bufferize \
+// RUN: --std-bufferize --finalizing-bufferize \
+// RUN: --convert-vector-to-llvm --convert-std-to-llvm | \
+// RUN: TENSOR0="%mlir_integration_test_dir/data/wide.mtx" \
+// RUN: mlir-cpu-runner \
+// RUN: -e entry -entry-point-result=void \
+// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
+//
+// RUN: mlir-opt %s \
+// RUN: --test-sparsification="lower vectorization-strategy=2 ptr-type=4 ind-type=4 vl=16" \
+// RUN: --convert-linalg-to-loops --convert-vector-to-scf --convert-scf-to-std \
+// RUN: --func-bufferize --tensor-constant-bufferize --tensor-bufferize \
+// RUN: --std-bufferize --finalizing-bufferize \
+// RUN: --convert-vector-to-llvm --convert-std-to-llvm | \
+// RUN: TENSOR0="%mlir_integration_test_dir/data/wide.mtx" \
+// RUN: mlir-cpu-runner \
+// RUN: -e entry -entry-point-result=void \
+// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
+
+//
+// Use descriptive names for opaque pointers.
+//
+!Filename = type !llvm.ptr<i8>
+!SparseTensor = type !llvm.ptr<i8>
+
+#matvec = {
+ indexing_maps = [
+ affine_map<(i,j) -> (i,j)>, // A
+ affine_map<(i,j) -> (j)>, // b
+ affine_map<(i,j) -> (i)> // x (out)
+ ],
+ sparse = [
+ [ "D", "S" ], // A
+ [ "D" ], // b
+ [ "D" ] // x
+ ],
+ iterator_types = ["parallel", "reduction"],
+ doc = "X(i) += A(i,j) * B(j)"
+}
+
+//
+// Integration test that lowers a kernel annotated as sparse to
+// actual sparse code, initializes a matching sparse storage scheme
+// from file, and runs the resulting code with the JIT compiler.
+//
+module {
+ //
+ // The kernel expressed as an annotated Linalg op. The kernel multiplies
+ // a sparse matrix A with a dense vector b into a dense vector x.
+ //
+ func @kernel_matvec(%argA: !SparseTensor,
+ %argb: tensor<?xf32>,
+ %argx: tensor<?xf32>) -> tensor<?xf32> {
+ %arga = linalg.sparse_tensor %argA : !SparseTensor to tensor<?x?xf32>
+ %0 = linalg.generic #matvec
+ ins(%arga, %argb: tensor<?x?xf32>, tensor<?xf32>)
+ outs(%argx: tensor<?xf32>) {
+ ^bb(%a: f32, %b: f32, %x: f32):
+ %0 = mulf %a, %b : f32
+ %1 = addf %x, %0 : f32
+ linalg.yield %1 : f32
+ } -> tensor<?xf32>
+ return %0 : tensor<?xf32>
+ }
+
+ //
+ // Runtime support library that is called directly from here.
+ //
+ func private @getTensorFilename(index) -> (!Filename)
+ func private @newSparseTensor(!Filename, memref<?xi1>, index, index, index) -> (!SparseTensor)
+ func private @delSparseTensor(!SparseTensor) -> ()
+
+ //
+ // Main driver that reads matrix from file and calls the sparse kernel.
+ //
+ func @entry() {
+ %f0 = constant 0.0 : f32
+ %c0 = constant 0 : index
+ %c1 = constant 1 : index
+ %c2 = constant 2 : index
+ %c4 = constant 4 : index
+ %c256 = constant 256 : index
+
+ // Mark inner dimension of the matrix as sparse and encode the
+ // storage scheme types (this must match the metadata in the
+ // alias above and compiler switches). In this case, we test
+ // that 8-bit indices and pointers work correctly.
+ %annotations = memref.alloc(%c2) : memref<?xi1>
+ %sparse = constant true
+ %dense = constant false
+ memref.store %dense, %annotations[%c0] : memref<?xi1>
+ memref.store %sparse, %annotations[%c1] : memref<?xi1>
+ %u8 = constant 4 : index
+ %f32 = constant 2 : index
+
+ // Read the sparse matrix from file, construct sparse storage.
+ %fileName = call @getTensorFilename(%c0) : (index) -> (!Filename)
+ %a = call @newSparseTensor(%fileName, %annotations, %u8, %u8, %f32)
+ : (!Filename, memref<?xi1>, index, index, index) -> (!SparseTensor)
+
+ // Initialize dense vectors.
+ %bdata = memref.alloc(%c256) : memref<?xf32>
+ %xdata = memref.alloc(%c4) : memref<?xf32>
+ scf.for %i = %c0 to %c256 step %c1 {
+ %k = addi %i, %c1 : index
+ %l = index_cast %k : index to i32
+ %f = sitofp %l : i32 to f32
+ memref.store %f, %bdata[%i] : memref<?xf32>
+ }
+ scf.for %i = %c0 to %c4 step %c1 {
+ memref.store %f0, %xdata[%i] : memref<?xf32>
+ }
+ %b = memref.tensor_load %bdata : memref<?xf32>
+ %x = memref.tensor_load %xdata : memref<?xf32>
+
+ // Call kernel.
+ %0 = call @kernel_matvec(%a, %b, %x)
+ : (!SparseTensor, tensor<?xf32>, tensor<?xf32>) -> tensor<?xf32>
+
+ // Print the result for verification.
+ //
+ // CHECK: ( 1659, 1534, 21, 18315 )
+ //
+ %m = memref.buffer_cast %0 : memref<?xf32>
+ %v = vector.transfer_read %m[%c0], %f0: memref<?xf32>, vector<4xf32>
+ vector.print %v : vector<4xf32>
+
+ // Release the resources.
+ call @delSparseTensor(%a) : (!SparseTensor) -> ()
+ memref.dealloc %bdata : memref<?xf32>
+ memref.dealloc %xdata : memref<?xf32>
+
+ return
+ }
+}
--- /dev/null
+%%MatrixMarket matrix coordinate real general
+%
+% This is a test sparse matrix in Matrix Market Exchange Format.
+% see https://math.nist.gov/MatrixMarket
+%
+4 256 17
+1 1 1.0
+1 127 2.0
+1 128 3.0
+1 255 4.0
+2 2 5.0
+2 254 6.0
+3 3 7.0
+4 1 8.0
+4 2 9.0
+4 4 10.0
+4 99 11.0
+4 127 12.0
+4 128 13.0
+4 129 14.0
+4 250 15.0
+4 254 16.0
+4 256 17.0