From 3acf49829c0064d5bcea5d8f6ca032559bf8e73a Mon Sep 17 00:00:00 2001
From: Aart Bik
Date: Tue, 6 Apr 2021 16:46:27 -0700
Subject: [PATCH] [mlir][sparse] support integral types i32,i16,i8 for
 *numerical* values

Some sparse matrices operate on integral values (in contrast with
the common f32 and f64 values). This CL expands the compiler and
runtime support to deal with several common type combinations.

Reviewed By: bixia

Differential Revision: https://reviews.llvm.org/D99999
---
 .../Dialect/Linalg/Transforms/SparseLowering.cpp   |   6 +
 .../Dialect/Linalg/Transforms/Sparsification.cpp   |  14 +-
 mlir/lib/ExecutionEngine/SparseUtils.cpp           | 183 +++++++++------------
 .../test/Integration/Sparse/CPU/sparse_matvec.mlir |  60 +++----
 mlir/test/Integration/data/wide.mtx                |  34 ++--
 5 files changed, 139 insertions(+), 158 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/SparseLowering.cpp b/mlir/lib/Dialect/Linalg/Transforms/SparseLowering.cpp
index ef8f131..b1efd24 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/SparseLowering.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/SparseLowering.cpp
@@ -132,6 +132,12 @@ public:
       name = "sparseValuesF64";
     else if (eltType.isF32())
       name = "sparseValuesF32";
+    else if (eltType.isInteger(32))
+      name = "sparseValuesI32";
+    else if (eltType.isInteger(16))
+      name = "sparseValuesI16";
+    else if (eltType.isInteger(8))
+      name = "sparseValuesI8";
     else
       return failure();
     rewriter.replaceOpWithNewOp<CallOp>(
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp b/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp
index 9ed3282..aa162bf8 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Sparsification.cpp
@@ -837,11 +837,19 @@ static void genReductionEnd(Merger &merger, CodeGen &codegen,
   assert(codegen.curVecLength == 1);
   codegen.redVal = merger.exp(codegen.redExp).val = Value(); // end chain
   unsigned lhs = op.getNumShapedOperands() - 1;
-  if (red.getType().isa<VectorType>()) {
+  if (auto vtp = red.getType().dyn_cast<VectorType>()) {
     // TODO: assumes + reductions for now
+    StringAttr kind = rewriter.getStringAttr("add");
     Value ld = genTensorLoad(merger, codegen, rewriter, op, codegen.redExp);
-    red = rewriter.create<vector::ReductionOp>(
-        op.getLoc(), ld.getType(), rewriter.getStringAttr("add"), red, ld);
+    // Integer reductions don't accept an accumulator.
+    if (vtp.getElementType().isa<IntegerType>()) {
+      red = rewriter.create<vector::ReductionOp>(op.getLoc(), ld.getType(),
+                                                 kind, red, ValueRange{});
+      red = rewriter.create<AddIOp>(op.getLoc(), red, ld);
+    } else {
+      red = rewriter.create<vector::ReductionOp>(op.getLoc(), ld.getType(),
+                                                 kind, red, ld);
+    }
   }
   genTensorStore(merger, codegen, rewriter, op, lhs, red);
 }
diff --git a/mlir/lib/ExecutionEngine/SparseUtils.cpp b/mlir/lib/ExecutionEngine/SparseUtils.cpp
index 5b4af4c..8f0dd53 100644
--- a/mlir/lib/ExecutionEngine/SparseUtils.cpp
+++ b/mlir/lib/ExecutionEngine/SparseUtils.cpp
@@ -127,6 +127,9 @@ public:
   // Primary storage.
   virtual void getValues(std::vector<double> **) { fatal("valf64"); }
   virtual void getValues(std::vector<float> **) { fatal("valf32"); }
+  virtual void getValues(std::vector<int32_t> **) { fatal("vali32"); }
+  virtual void getValues(std::vector<int16_t> **) { fatal("vali16"); }
+  virtual void getValues(std::vector<int8_t> **) { fatal("vali8"); }
 
   virtual ~SparseTensorStorageBase() {}
 
@@ -453,64 +456,58 @@ char *getTensorFilename(uint64_t id) {
 // implementation of a bufferized SparseTensor in MLIR. This could be replaced
 // by actual codegen in MLIR.
 //
+// Because we cannot use C++ templates with C linkage, some macro magic is used
+// to generate implementations for all required type combinations that can be
+// called from MLIR generated code.
+//
 //===----------------------------------------------------------------------===//
 
-// Cannot use templates with C linkage.
-
-struct MemRef1DU64 {
-  const uint64_t *base;
-  const uint64_t *data;
-  uint64_t off;
-  uint64_t sizes[1];
-  uint64_t strides[1];
-};
-
-struct MemRef1DU32 {
-  const uint32_t *base;
-  const uint32_t *data;
-  uint64_t off;
-  uint64_t sizes[1];
-  uint64_t strides[1];
-};
+#define TEMPLATE(NAME, TYPE) \
+  struct NAME { \
+    const TYPE *base; \
+    const TYPE *data; \
+    uint64_t off; \
+    uint64_t sizes[1]; \
+    uint64_t strides[1]; \
+  }
 
-struct MemRef1DU16 {
-  const uint16_t *base;
-  const uint16_t *data;
-  uint64_t off;
-  uint64_t sizes[1];
-  uint64_t strides[1];
-};
+#define CASE(p, i, v, P, I, V) \
+  if (ptrTp == (p) && indTp == (i) && valTp == (v)) \
+    return newSparseTensor<P, I, V>(filename, sparsity, asize)
 
-struct MemRef1DU8 {
-  const uint8_t *base;
-  const uint8_t *data;
-  uint64_t off;
-  uint64_t sizes[1];
-  uint64_t strides[1];
-};
+#define IMPL1(RET, NAME, TYPE, LIB) \
+  RET NAME(void *tensor) { \
+    std::vector<TYPE> *v; \
+    static_cast<SparseTensorStorageBase *>(tensor)->LIB(&v); \
+    return {v->data(), v->data(), 0, {v->size()}, {1}}; \
+  }
 
-struct MemRef1DF64 {
-  const double *base;
-  const double *data;
-  uint64_t off;
-  uint64_t sizes[1];
-  uint64_t strides[1];
-};
+#define IMPL2(RET, NAME, TYPE, LIB) \
+  RET NAME(void *tensor, uint64_t d) { \
+    std::vector<TYPE> *v; \
+    static_cast<SparseTensorStorageBase *>(tensor)->LIB(&v, d); \
+    return {v->data(), v->data(), 0, {v->size()}, {1}}; \
+  }
 
-struct MemRef1DF32 {
-  const float *base;
-  const float *data;
-  uint64_t off;
-  uint64_t sizes[1];
-  uint64_t strides[1];
-};
+TEMPLATE(MemRef1DU64, uint64_t);
+TEMPLATE(MemRef1DU32, uint32_t);
+TEMPLATE(MemRef1DU16, uint16_t);
+TEMPLATE(MemRef1DU8, uint8_t);
+TEMPLATE(MemRef1DI32, int32_t);
+TEMPLATE(MemRef1DI16, int16_t);
+TEMPLATE(MemRef1DI8, int8_t);
+TEMPLATE(MemRef1DF64, double);
+TEMPLATE(MemRef1DF32, float);
 
 enum OverheadTypeEnum : uint64_t { kU64 = 1, kU32 = 2, kU16 = 3, kU8 = 4 };
 
-enum PrimaryTypeEnum : uint64_t { kF64 = 1, kF32 = 2 };
-#define CASE(p, i, v, P, I, V) \
-  if (ptrTp == (p) && indTp == (i) && valTp == (v)) \
-    return newSparseTensor<P, I, V>(filename, sparsity, asize)
+enum PrimaryTypeEnum : uint64_t {
+  kF64 = 1,
+  kF32 = 2,
+  kI32 = 3,
+  kI16 = 4,
+  kI8 = 5
+};
 
 void *newSparseTensor(char *filename, bool *abase, bool *adata, uint64_t aoff,
                       uint64_t asize, uint64_t astride, uint64_t ptrTp,
@@ -534,6 +531,17 @@ void *newSparseTensor(char *filename, bool *abase, bool *adata, uint64_t aoff,
   CASE(kU16, kU16, kF32, uint16_t, uint16_t, float);
   CASE(kU8, kU8, kF32, uint8_t, uint8_t, float);
 
+  // Integral matrices with low overhead storage.
+  CASE(kU32, kU32, kI32, uint32_t, uint32_t, int32_t);
+  CASE(kU32, kU32, kI16, uint32_t, uint32_t, int16_t);
+  CASE(kU32, kU32, kI8, uint32_t, uint32_t, int8_t);
+  CASE(kU16, kU16, kI32, uint16_t, uint16_t, int32_t);
+  CASE(kU16, kU16, kI16, uint16_t, uint16_t, int16_t);
+  CASE(kU16, kU16, kI8, uint16_t, uint16_t, int8_t);
+  CASE(kU8, kU8, kI32, uint8_t, uint8_t, int32_t);
+  CASE(kU8, kU8, kI16, uint8_t, uint8_t, int16_t);
+  CASE(kU8, kU8, kI8, uint8_t, uint8_t, int8_t);
+
   // Unsupported case (add above if needed).
fputs("unsupported combination of types\n", stderr); exit(1); @@ -545,70 +553,29 @@ uint64_t sparseDimSize(void *tensor, uint64_t d) { return static_cast(tensor)->getDimSize(d); } -MemRef1DU64 sparsePointers64(void *tensor, uint64_t d) { - std::vector *v; - static_cast(tensor)->getPointers(&v, d); - return {v->data(), v->data(), 0, {v->size()}, {1}}; -} - -MemRef1DU32 sparsePointers32(void *tensor, uint64_t d) { - std::vector *v; - static_cast(tensor)->getPointers(&v, d); - return {v->data(), v->data(), 0, {v->size()}, {1}}; -} - -MemRef1DU16 sparsePointers16(void *tensor, uint64_t d) { - std::vector *v; - static_cast(tensor)->getPointers(&v, d); - return {v->data(), v->data(), 0, {v->size()}, {1}}; -} - -MemRef1DU8 sparsePointers8(void *tensor, uint64_t d) { - std::vector *v; - static_cast(tensor)->getPointers(&v, d); - return {v->data(), v->data(), 0, {v->size()}, {1}}; -} - -MemRef1DU64 sparseIndices64(void *tensor, uint64_t d) { - std::vector *v; - static_cast(tensor)->getIndices(&v, d); - return {v->data(), v->data(), 0, {v->size()}, {1}}; -} - -MemRef1DU32 sparseIndices32(void *tensor, uint64_t d) { - std::vector *v; - static_cast(tensor)->getIndices(&v, d); - return {v->data(), v->data(), 0, {v->size()}, {1}}; -} - -MemRef1DU16 sparseIndices16(void *tensor, uint64_t d) { - std::vector *v; - static_cast(tensor)->getIndices(&v, d); - return {v->data(), v->data(), 0, {v->size()}, {1}}; -} - -MemRef1DU8 sparseIndices8(void *tensor, uint64_t d) { - std::vector *v; - static_cast(tensor)->getIndices(&v, d); - return {v->data(), v->data(), 0, {v->size()}, {1}}; -} - -MemRef1DF64 sparseValuesF64(void *tensor) { - std::vector *v; - static_cast(tensor)->getValues(&v); - return {v->data(), v->data(), 0, {v->size()}, {1}}; -} - -MemRef1DF32 sparseValuesF32(void *tensor) { - std::vector *v; - static_cast(tensor)->getValues(&v); - return {v->data(), v->data(), 0, {v->size()}, {1}}; -} +IMPL2(MemRef1DU64, sparsePointers64, uint64_t, getPointers) +IMPL2(MemRef1DU32, sparsePointers32, uint32_t, getPointers) +IMPL2(MemRef1DU16, sparsePointers16, uint16_t, getPointers) +IMPL2(MemRef1DU8, sparsePointers8, uint8_t, getPointers) +IMPL2(MemRef1DU64, sparseIndices64, uint64_t, getIndices) +IMPL2(MemRef1DU32, sparseIndices32, uint32_t, getIndices) +IMPL2(MemRef1DU16, sparseIndices16, uint16_t, getIndices) +IMPL2(MemRef1DU8, sparseIndices8, uint8_t, getIndices) +IMPL1(MemRef1DF64, sparseValuesF64, double, getValues) +IMPL1(MemRef1DF32, sparseValuesF32, float, getValues) +IMPL1(MemRef1DI32, sparseValuesI32, int32_t, getValues) +IMPL1(MemRef1DI16, sparseValuesI16, int16_t, getValues) +IMPL1(MemRef1DI8, sparseValuesI8, int8_t, getValues) void delSparseTensor(void *tensor) { delete static_cast(tensor); } +#undef TEMPLATE +#undef CASE +#undef IMPL1 +#undef IMPL2 + } // extern "C" #endif // MLIR_CRUNNERUTILS_DEFINE_FUNCTIONS diff --git a/mlir/test/Integration/Sparse/CPU/sparse_matvec.mlir b/mlir/test/Integration/Sparse/CPU/sparse_matvec.mlir index 41ee9cc..fde4947 100644 --- a/mlir/test/Integration/Sparse/CPU/sparse_matvec.mlir +++ b/mlir/test/Integration/Sparse/CPU/sparse_matvec.mlir @@ -54,18 +54,18 @@ module { // a sparse matrix A with a dense vector b into a dense vector x. 
   //
   func @kernel_matvec(%argA: !SparseTensor,
-                      %argb: tensor<?xf32>,
-                      %argx: tensor<?xf32>) -> tensor<?xf32> {
-    %arga = linalg.sparse_tensor %argA : !SparseTensor to tensor<?x?xf32>
+                      %argb: tensor<?xi32>,
+                      %argx: tensor<?xi32>) -> tensor<?xi32> {
+    %arga = linalg.sparse_tensor %argA : !SparseTensor to tensor<?x?xi32>
     %0 = linalg.generic #matvec
-      ins(%arga, %argb: tensor<?x?xf32>, tensor<?xf32>)
-      outs(%argx: tensor<?xf32>) {
-      ^bb(%a: f32, %b: f32, %x: f32):
-        %0 = mulf %a, %b : f32
-        %1 = addf %x, %0 : f32
-        linalg.yield %1 : f32
-    } -> tensor<?xf32>
-    return %0 : tensor<?xf32>
+      ins(%arga, %argb: tensor<?x?xi32>, tensor<?xi32>)
+      outs(%argx: tensor<?xi32>) {
+      ^bb(%a: i32, %b: i32, %x: i32):
+        %0 = muli %a, %b : i32
+        %1 = addi %x, %0 : i32
+        linalg.yield %1 : i32
+    } -> tensor<?xi32>
+    return %0 : tensor<?xi32>
   }
 
   //
@@ -79,7 +79,7 @@
   // Main driver that reads matrix from file and calls the sparse kernel.
   //
   func @entry() {
-    %f0 = constant 0.0 : f32
+    %i0 = constant 0 : i32
     %c0 = constant 0 : index
     %c1 = constant 1 : index
     %c2 = constant 2 : index
@@ -89,51 +89,51 @@
     // Mark inner dimension of the matrix as sparse and encode the
     // storage scheme types (this must match the metadata in the
     // alias above and compiler switches). In this case, we test
-    // that 8-bit indices and pointers work correctly.
+    // that 8-bit indices and pointers work correctly on a matrix
+    // with i32 elements.
     %annotations = memref.alloc(%c2) : memref<?xi1>
     %sparse = constant true
     %dense = constant false
     memref.store %dense, %annotations[%c0] : memref<?xi1>
     memref.store %sparse, %annotations[%c1] : memref<?xi1>
     %u8 = constant 4 : index
-    %f32 = constant 2 : index
+    %i32 = constant 3 : index
 
     // Read the sparse matrix from file, construct sparse storage.
     %fileName = call @getTensorFilename(%c0) : (index) -> (!Filename)
-    %a = call @newSparseTensor(%fileName, %annotations, %u8, %u8, %f32)
+    %a = call @newSparseTensor(%fileName, %annotations, %u8, %u8, %i32)
       : (!Filename, memref<?xi1>, index, index, index) -> (!SparseTensor)
 
     // Initialize dense vectors.
-    %bdata = memref.alloc(%c256) : memref<?xf32>
-    %xdata = memref.alloc(%c4) : memref<?xf32>
+    %bdata = memref.alloc(%c256) : memref<?xi32>
+    %xdata = memref.alloc(%c4) : memref<?xi32>
     scf.for %i = %c0 to %c256 step %c1 {
       %k = addi %i, %c1 : index
-      %l = index_cast %k : index to i32
-      %f = sitofp %l : i32 to f32
-      memref.store %f, %bdata[%i] : memref<?xf32>
+      %j = index_cast %k : index to i32
+      memref.store %j, %bdata[%i] : memref<?xi32>
    }
     scf.for %i = %c0 to %c4 step %c1 {
-      memref.store %f0, %xdata[%i] : memref<?xf32>
+      memref.store %i0, %xdata[%i] : memref<?xi32>
     }
-    %b = memref.tensor_load %bdata : memref<?xf32>
-    %x = memref.tensor_load %xdata : memref<?xf32>
+    %b = memref.tensor_load %bdata : memref<?xi32>
+    %x = memref.tensor_load %xdata : memref<?xi32>
 
     // Call kernel.
     %0 = call @kernel_matvec(%a, %b, %x)
-      : (!SparseTensor, tensor<?xf32>, tensor<?xf32>) -> tensor<?xf32>
+      : (!SparseTensor, tensor<?xi32>, tensor<?xi32>) -> tensor<?xi32>
 
     // Print the result for verification.
     //
-    // CHECK: ( 1659, 1534, 21, 18315 )
+    // CHECK: ( 889, 1514, -21, -3431 )
     //
-    %m = memref.buffer_cast %0 : memref<?xf32>
-    %v = vector.transfer_read %m[%c0], %f0: memref<?xf32>, vector<4xf32>
-    vector.print %v : vector<4xf32>
+    %m = memref.buffer_cast %0 : memref<?xi32>
+    %v = vector.transfer_read %m[%c0], %i0: memref<?xi32>, vector<4xi32>
+    vector.print %v : vector<4xi32>
 
     // Release the resources.
     call @delSparseTensor(%a) : (!SparseTensor) -> ()
-    memref.dealloc %bdata : memref<?xf32>
-    memref.dealloc %xdata : memref<?xf32>
+    memref.dealloc %bdata : memref<?xi32>
+    memref.dealloc %xdata : memref<?xi32>
 
     return
   }
diff --git a/mlir/test/Integration/data/wide.mtx b/mlir/test/Integration/data/wide.mtx
index 6b5ee20..9e0d5f2 100644
--- a/mlir/test/Integration/data/wide.mtx
+++ b/mlir/test/Integration/data/wide.mtx
@@ -4,20 +4,20 @@
 % see https://math.nist.gov/MatrixMarket
 %
 4 256 17
-1 1 1.0
-1 127 2.0
-1 128 3.0
-1 255 4.0
-2 2 5.0
-2 254 6.0
-3 3 7.0
-4 1 8.0
-4 2 9.0
-4 4 10.0
-4 99 11.0
-4 127 12.0
-4 128 13.0
-4 129 14.0
-4 250 15.0
-4 254 16.0
-4 256 17.0
+1 1 -1
+1 127 2
+1 128 -3
+1 255 4
+2 2 -5
+2 254 6
+3 3 -7
+4 1 8
+4 2 -9
+4 4 10
+4 99 -11
+4 127 12
+4 128 -13
+4 129 14
+4 250 -15
+4 254 16
+4 256 -17
-- 
2.7.4
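
Note (editorial, not part of the patch): the SparseUtils.cpp hunks replace the hand-written
per-type MemRef1D* structs and sparse* accessors with TEMPLATE/IMPL1/IMPL2 macros, because
the runtime entry points carry C linkage and therefore cannot be C++ templates. The
self-contained sketch below illustrates the same macro technique in isolation; every name in
it (DEMO_TEMPLATE, DEMO_IMPL1, DemoRef1DI32, demoValuesI32) is invented for this illustration
and does not appear in the patch.

  #include <cstdint>
  #include <cstdio>
  #include <vector>

  // Same idea as TEMPLATE in SparseUtils.cpp: stamp out one 1-D memref
  // descriptor struct per element type.
  #define DEMO_TEMPLATE(NAME, TYPE) \
    struct NAME { \
      const TYPE *base; \
      const TYPE *data; \
      uint64_t off; \
      uint64_t sizes[1]; \
      uint64_t strides[1]; \
    }

  // Same idea as IMPL1: stamp out one C-linkage accessor per descriptor type,
  // since a function with C linkage cannot itself be a template.
  #define DEMO_IMPL1(RET, NAME, TYPE) \
    RET NAME(std::vector<TYPE> *v) { \
      return {v->data(), v->data(), 0, {v->size()}, {1}}; \
    }

  DEMO_TEMPLATE(DemoRef1DI32, int32_t);
  DEMO_TEMPLATE(DemoRef1DF64, double);

  extern "C" {
  DEMO_IMPL1(DemoRef1DI32, demoValuesI32, int32_t)
  DEMO_IMPL1(DemoRef1DF64, demoValuesF64, double)
  } // extern "C"

  int main() {
    std::vector<int32_t> v = {1, -2, 3};
    DemoRef1DI32 ref = demoValuesI32(&v);
    std::printf("%u elements, first = %d\n", (unsigned)ref.sizes[0],
                (int)ref.data[0]);
    return 0;
  }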
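
Note (editorial, not part of the patch): the new FileCheck line ( 889, 1514, -21, -3431 ) is
simply A * b for the updated integer wide.mtx with b[j] = j (the test stores j = i + 1 at
position i). A standalone sketch that recomputes those four values; only the triplet data is
copied from the hunk above, the rest is written for this illustration.

  #include <cstdio>

  int main() {
    // (row, col, value) triplets copied from the new wide.mtx (1-based indices).
    const struct { int i, j, v; } A[17] = {
        {1, 1, -1},   {1, 127, 2},  {1, 128, -3},  {1, 255, 4},  {2, 2, -5},
        {2, 254, 6},  {3, 3, -7},   {4, 1, 8},     {4, 2, -9},   {4, 4, 10},
        {4, 99, -11}, {4, 127, 12}, {4, 128, -13}, {4, 129, 14}, {4, 250, -15},
        {4, 254, 16}, {4, 256, -17}};
    int x[4] = {0, 0, 0, 0};
    for (const auto &e : A)
      x[e.i - 1] += e.v * e.j; // b[j] = j, so entry (i,j,v) contributes v * j
    std::printf("( %d, %d, %d, %d )\n", x[0], x[1], x[2], x[3]);
    return 0;
  }

This prints ( 889, 1514, -21, -3431 ), matching the updated CHECK line in sparse_matvec.mlir.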