From: musikisomorphie
Date: Mon, 11 Feb 2019 03:31:38 +0000 (-0800)
Subject: Improve the Sparse matrix multiplication computational speed #16187 (#16905)
X-Git-Tag: accepted/tizen/6.5/unified/20211028.231830~1371
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=c282afffa79e719e366b869914fc824dbb65684b;p=platform%2Fupstream%2Fpytorch.git

Improve the Sparse matrix multiplication computational speed #16187 (#16905)

Summary:
Instead of converting the sparse matrix from COO to CSR format as in the original implementation, my revision uses the COO format directly for sparse-dense matrix multiplication. On my Linux machine it is about 5 times faster than the original code:

```
(original code)
SIZE: 15000 DENSITY: 0.01 DEVICE: cpu
torch: 0.39403 seconds
np: 0.00496674 seconds
torch/np: 79.3338
----------------------------------------
(my update)
SIZE: 15000 DENSITY: 0.01 DEVICE: cpu
torch: 0.0812583 seconds
np: 0.00501871 seconds
torch/np: 16.1911
```

Further code feedback and running-time tests are very welcome. I will keep revising my code if needed.
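To make the idea concrete, here is a minimal standalone C++ sketch of the COO-based accumulation (a simplification with hypothetical names and plain `std::vector` buffers, not the actual ATen tensors, accessors, or `THBlas_axpy` call used in the patch below): each nonzero `(row, col, val)` adds `alpha * val` times row `col` of the dense matrix into row `row` of the result, so no CSR conversion or prior coalescing of the sparse tensor is required.

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Sketch of r = beta * t + alpha * sparse * dense, with sparse given in COO
// form (rows/cols/values of length nnz). sparse is dim_i x dim_j, dense is
// dim_j x dim_k, and t/r are dim_i x dim_k, all stored row-major.
void coo_addmm_sketch(int64_t nnz, int64_t dim_i, int64_t dim_j, int64_t dim_k,
                      double beta, const std::vector<double>& t, double alpha,
                      const std::vector<int64_t>& rows,    // indices[0][*]
                      const std::vector<int64_t>& cols,    // indices[1][*]
                      const std::vector<double>& values,
                      const std::vector<double>& dense,
                      std::vector<double>& r) {
  // r = beta * t
  for (int64_t n = 0; n < dim_i * dim_k; n++) r[n] = beta * t[n];

  // One pass over the nonzeros; no CSR conversion needed. Each nonzero
  // contributes r[row, :] += alpha * val * dense[col, :] (the axpy in the patch).
  for (int64_t i = 0; i < nnz; i++) {
    int64_t row = rows[i];
    int64_t col = cols[i];
    // Skip out-of-range indices (the patch raises an error here instead).
    if (row < 0 || row >= dim_i || col < 0 || col >= dim_j) continue;
    double scale = alpha * values[i];
    for (int64_t k = 0; k < dim_k; k++) {
      r[row * dim_k + k] += scale * dense[col * dim_k + k];
    }
  }
}

int main() {
  // 2x3 sparse matrix with nonzeros (0,2)=2 and (1,0)=3, times a 3x2 dense matrix.
  std::vector<int64_t> rows = {0, 1}, cols = {2, 0};
  std::vector<double> values = {2.0, 3.0};
  std::vector<double> dense = {1, 2, 3, 4, 5, 6};  // rows [1,2], [3,4], [5,6]
  std::vector<double> t(4, 0.0), r(4, 0.0);
  coo_addmm_sketch(2, 2, 3, 2, /*beta=*/0.0, t, /*alpha=*/1.0,
                   rows, cols, values, dense, r);
  for (double x : r) std::cout << x << " ";  // expected: 10 12 3 6
  std::cout << "\n";
  return 0;
}
```

One trade-off worth noting: since several nonzeros can target the same output row, the per-nonzero loop cannot reuse the old per-row `#pragma omp parallel for` without risking concurrent writes to `r`, which is presumably why the patched worker runs serially.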
", dim_j); } else { - AT_ERROR("addmm: index out of bound: ", col, " not between 1 and ", dim_j); + AT_ERROR("addmm: index out of row bound: ", row, " not between 1 and ", dim_i); } } } @@ -527,11 +526,9 @@ Tensor& s_addmm_out_sparse_dense_cpu( AT_CHECK(sparse_.dense_dim() == 0, "addmm: scalar values expected, got ", sparse_.dense_dim(), "D values"); AT_CHECK(dense.dim() == 2, "addmm: matrices expected, got ", dense.dim(), "D tensor"); - SparseTensor sparse = sparse_.coalesce(); - // ixj * jxk = ixk - int64_t dim_i = sparse.size(0); - int64_t dim_j = sparse.size(1); + int64_t dim_i = sparse_.size(0); + int64_t dim_j = sparse_.size(1); int64_t dim_k = dense.size(1); AT_CHECK(dense.size(0) == dim_j, @@ -543,20 +540,19 @@ Tensor& s_addmm_out_sparse_dense_cpu( r.resize_({dim_i, dim_k}); - int64_t nnz = sparse._nnz(); + int64_t nnz = sparse_._nnz(); if (nnz == 0) { at::mul_out(r, t, at::scalar_tensor(beta, r.options())); return r; } - LongTensor indices = sparse._indices(); - Tensor values = sparse._values(); - LongTensor csr = _to_csr(indices.data(), dim_i, nnz); + LongTensor indices = sparse_._indices(); + Tensor values = sparse_._values(); AT_DISPATCH_ALL_TYPES( values.type(), "addmm_sparse_dense", [&] { - s_addmm_out_sparse_dense_worker(nnz, dim_i, dim_j, dim_k, r, beta, t, alpha, csr, indices, values, dense); + s_addmm_out_sparse_dense_worker(nnz, dim_i, dim_j, dim_k, r, beta, t, alpha, indices, values, dense); } );