From 7ec55ab5eb032856e30705da3046e050e2257668 Mon Sep 17 00:00:00 2001 From: Jihoon Lee Date: Fri, 11 Dec 2020 17:13:07 +0900 Subject: [PATCH] [Optim] Add shortcut to dot product When dimension is 1, it is vector by matrix or vector by vector multiplication. This patch adds a shortcut in that situation **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: Jihoon Lee --- nntrainer/tensor/blas_interface.cpp | 19 +++ nntrainer/tensor/blas_interface.h | 4 +- nntrainer/tensor/tensor.cpp | 32 +++- test/unittest/unittest_nntrainer_tensor.cpp | 243 ++++++++++++++++++++++++++++ 4 files changed, 294 insertions(+), 4 deletions(-) diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp index 30bf919..720e4e1 100644 --- a/nntrainer/tensor/blas_interface.cpp +++ b/nntrainer/tensor/blas_interface.cpp @@ -56,6 +56,16 @@ static void sgemv_raw(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, } } +static float sdot_raw(const unsigned int N, const float *X, + const unsigned int incX, const float *Y, + const unsigned int incY) { + float ret = 0; + for (unsigned int i = 0; i < N; ++i) { + ret += X[i * incX] * Y[i * incY]; + } + return ret; +} + static void scopy_raw(const unsigned int N, const float *X, const int incX, float *Y, const int incY) { unsigned int incy = abs(incY); @@ -186,6 +196,15 @@ float snrm2(const int N, const float *X, const int incX) { #endif } +float sdot(const unsigned int N, const float *X, const unsigned int incX, + const float *Y, const unsigned int incY) { +#ifdef USE_BLAS + return cblas_sdot(N, X, incX, Y, incY); +#else + return sdot_raw(N, X, incX, Y, incY); +#endif +} + void sgemv(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int M, const unsigned int N, const float alpha, const float *A, const unsigned int lda, const float *X, const int incX, diff --git a/nntrainer/tensor/blas_interface.h b/nntrainer/tensor/blas_interface.h index 3cc9b38..b09d945 
100644 --- a/nntrainer/tensor/blas_interface.h +++ b/nntrainer/tensor/blas_interface.h @@ -36,7 +36,6 @@ enum CBLAS_TRANSPOSE { namespace nntrainer { -/* TODO : need to scopy, sscal, snrm2 */ void sscal(const int N, const float alpha, float *X, const int incX); float snrm2(const int N, const float *X, const int incX); @@ -44,6 +43,9 @@ float snrm2(const int N, const float *X, const int incX); void scopy(const unsigned int N, const float *X, const int incX, float *Y, const int intY); +float sdot(const unsigned int N, const float *X, const unsigned int incX, + const float *Y, const unsigned int incY); + void saxpy(const unsigned int N, const float alpha, const float *X, const int incX, float *Y, const int incY); diff --git a/nntrainer/tensor/tensor.cpp b/nntrainer/tensor/tensor.cpp index fde2996..fd3edf1 100644 --- a/nntrainer/tensor/tensor.cpp +++ b/nntrainer/tensor/tensor.cpp @@ -633,11 +633,37 @@ Tensor &Tensor::dot(Tensor const &m, Tensor &result, bool trans, float *rdata = result.getData(); const float alpha = 1.0f; const float beta = 0.0f; - enum CBLAS_TRANSPOSE transA = trans ? CblasTrans : CblasNoTrans; enum CBLAS_TRANSPOSE transB = trans_m ? CblasTrans : CblasNoTrans; - sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, data, lda, mdata, ldb, - beta, rdata, ldc); + + /// shortcut handling in case of vector + /// for vector, (1 * K) == (K * 1) in current memory layout... + /// and please note that N, K, M are fixed placeholders after considering + /// transpose. 
+ /// For example, there is no case like (1 * K) X (1 * K) while + /// (1 * K) X (1 * M) can be a case + /// case1: (1 * K) X (K * 1) + if (M == 1 && N == 1) { + *rdata = sdot(K, data, 1, mdata, 1); + } + /// case2: (M * K) X (K * 1) + else if (N == 1) { + sgemv(CblasRowMajor, transA, dim1, dim2, alpha, data, lda, mdata, 1, beta, + rdata, 1); + } + /// case3: (1 * K) X (K * N) = 1 * N = R + /// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K) + /// Effectively a translation of sgemv + else if (M == 1) { + transB = transB == CblasTrans ? CblasNoTrans : CblasTrans; + sgemv(CblasRowMajor, transB, mdim1, mdim2, alpha, mdata, ldb, data, 1, beta, + rdata, 1); + } + /// case others: use gemm + else { + sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, data, lda, mdata, ldb, + beta, rdata, ldc); + } return result; } diff --git a/test/unittest/unittest_nntrainer_tensor.cpp b/test/unittest/unittest_nntrainer_tensor.cpp index 85b2641..ba3d901 100644 --- a/test/unittest/unittest_nntrainer_tensor.cpp +++ b/test/unittest/unittest_nntrainer_tensor.cpp @@ -1956,6 +1956,249 @@ TEST(nntrainer_Tensor, dot_transpose_p) { } } +TEST(nntrainer_Tensor, dot_shortcuts_p) { + { + float a_data[] = {0, 1, 2}; + nntrainer::Tensor a(nntrainer::TensorDim(1, 1, 1, 3), a_data); + float b_data[] = {0, 1, 2}; + nntrainer::Tensor b(nntrainer::TensorDim(1, 1, 3, 1), b_data); + float answer_data[] = {5}; + nntrainer::Tensor answer(nntrainer::TensorDim(1, 1, 1, 1), answer_data); + nntrainer::Tensor ret = a.dot(b, false, false); + EXPECT_EQ(ret, answer); + } + { + float a_data[] = {0, 1, 2}; + nntrainer::Tensor a(nntrainer::TensorDim(1, 1, 3, 1), a_data); + float b_data[] = {0, 1, 2}; + nntrainer::Tensor b(nntrainer::TensorDim(1, 1, 3, 1), b_data); + float answer_data[] = {5}; + nntrainer::Tensor answer(nntrainer::TensorDim(1, 1, 1, 1), answer_data); + nntrainer::Tensor ret = a.dot(b, true, false); + EXPECT_EQ(ret, answer); + } + { + float a_data[] = {0, 1, 2}; + 
nntrainer::Tensor a(nntrainer::TensorDim(1, 1, 1, 3), a_data); + float b_data[] = {0, 1, 2}; + nntrainer::Tensor b(nntrainer::TensorDim(1, 1, 1, 3), b_data); + float answer_data[] = {5}; + nntrainer::Tensor answer(nntrainer::TensorDim(1, 1, 1, 1), answer_data); + nntrainer::Tensor ret = a.dot(b, false, true); + EXPECT_EQ(ret, answer); + } + { + float a_data[] = {0, 1, 2}; + nntrainer::Tensor a(nntrainer::TensorDim(1, 1, 3, 1), a_data); + float b_data[] = {0, 1, 2}; + nntrainer::Tensor b(nntrainer::TensorDim(1, 1, 1, 3), b_data); + float answer_data[] = {5}; + nntrainer::Tensor answer(nntrainer::TensorDim(1, 1, 1, 1), answer_data); + nntrainer::Tensor ret = a.dot(b, true, true); + EXPECT_EQ(ret, answer); + } + { + float a_data[] = {0, 1, 2, 3, 4, 5}; + nntrainer::Tensor a(nntrainer::TensorDim(1, 1, 2, 3), a_data); + float b_data[] = {0, 1, 2}; + nntrainer::Tensor b(nntrainer::TensorDim(1, 1, 3, 1), b_data); + float answer_data[] = {5, 14}; + nntrainer::Tensor answer(nntrainer::TensorDim(1, 1, 2, 1), answer_data); + nntrainer::Tensor ret = a.dot(b, false, false); + EXPECT_EQ(ret, answer); + } + { + float a_data[] = {0, 3, 1, 4, 2, 5}; + nntrainer::Tensor a(nntrainer::TensorDim(1, 1, 3, 2), a_data); + float b_data[] = {0, 1, 2}; + nntrainer::Tensor b(nntrainer::TensorDim(1, 1, 3, 1), b_data); + float answer_data[] = {5, 14}; + nntrainer::Tensor answer(nntrainer::TensorDim(1, 1, 2, 1), answer_data); + nntrainer::Tensor ret = a.dot(b, true, false); + EXPECT_EQ(ret, answer); + } + { + float a_data[] = {0, 1, 2, 3, 4, 5}; + nntrainer::Tensor a(nntrainer::TensorDim(1, 1, 2, 3), a_data); + float b_data[] = {0, 1, 2}; + nntrainer::Tensor b(nntrainer::TensorDim(1, 1, 1, 3), b_data); + float answer_data[] = {5, 14}; + nntrainer::Tensor answer(nntrainer::TensorDim(1, 1, 2, 1), answer_data); + nntrainer::Tensor ret = a.dot(b, false, true); + EXPECT_EQ(ret, answer); + } + { + float a_data[] = {0, 3, 1, 4, 2, 5}; + nntrainer::Tensor a(nntrainer::TensorDim(1, 1, 3, 2), a_data); + 
float b_data[] = {0, 1, 2}; + nntrainer::Tensor b(nntrainer::TensorDim(1, 1, 1, 3), b_data); + float answer_data[] = {5, 14}; + nntrainer::Tensor answer(nntrainer::TensorDim(1, 1, 2, 1), answer_data); + nntrainer::Tensor ret = a.dot(b, true, true); + EXPECT_EQ(ret, answer); + } + { + float a_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + nntrainer::Tensor a(nntrainer::TensorDim(1, 1, 4, 3), a_data); + float b_data[] = {0, 1, 2}; + nntrainer::Tensor b(nntrainer::TensorDim(1, 1, 3, 1), b_data); + float answer_data[] = {5, 14, 23, 32}; + nntrainer::Tensor answer(nntrainer::TensorDim(1, 1, 4, 1), answer_data); + nntrainer::Tensor ret = a.dot(b, false, false); + EXPECT_EQ(ret, answer); + } + { + float a_data[] = {0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11}; + nntrainer::Tensor a(nntrainer::TensorDim(1, 1, 3, 4), a_data); + float b_data[] = {0, 1, 2}; + nntrainer::Tensor b(nntrainer::TensorDim(1, 1, 3, 1), b_data); + float answer_data[] = {5, 14, 23, 32}; + nntrainer::Tensor answer(nntrainer::TensorDim(1, 1, 4, 1), answer_data); + nntrainer::Tensor ret = a.dot(b, true, false); + EXPECT_EQ(ret, answer); + } + { + float a_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + nntrainer::Tensor a(nntrainer::TensorDim(1, 1, 4, 3), a_data); + float b_data[] = {0, 1, 2}; + nntrainer::Tensor b(nntrainer::TensorDim(1, 1, 1, 3), b_data); + float answer_data[] = {5, 14, 23, 32}; + nntrainer::Tensor answer(nntrainer::TensorDim(1, 1, 4, 1), answer_data); + nntrainer::Tensor ret = a.dot(b, false, true); + EXPECT_EQ(ret, answer); + } + { + float a_data[] = {0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11}; + nntrainer::Tensor a(nntrainer::TensorDim(1, 1, 3, 4), a_data); + float b_data[] = {0, 1, 2}; + nntrainer::Tensor b(nntrainer::TensorDim(1, 1, 1, 3), b_data); + float answer_data[] = {5, 14, 23, 32}; + nntrainer::Tensor answer(nntrainer::TensorDim(1, 1, 4, 1), answer_data); + nntrainer::Tensor ret = a.dot(b, true, true); + EXPECT_EQ(ret, answer); + } + { + float a_data[] = {0, 1, 2}; + 
nntrainer::Tensor a(nntrainer::TensorDim(1, 1, 1, 3), a_data); + float b_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + nntrainer::Tensor b(nntrainer::TensorDim(1, 1, 3, 4), b_data); + float answer_data[] = {20, 23, 26, 29}; + nntrainer::Tensor answer(nntrainer::TensorDim(1, 1, 1, 4), answer_data); + nntrainer::Tensor ret = a.dot(b, false, false); + EXPECT_EQ(ret, answer); + } + { + float a_data[] = {0, 1, 2}; + nntrainer::Tensor a(nntrainer::TensorDim(1, 1, 3, 1), a_data); + float b_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + nntrainer::Tensor b(nntrainer::TensorDim(1, 1, 3, 4), b_data); + float answer_data[] = {20, 23, 26, 29}; + nntrainer::Tensor answer(nntrainer::TensorDim(1, 1, 1, 4), answer_data); + nntrainer::Tensor ret = a.dot(b, true, false); + EXPECT_EQ(ret, answer); + } + { + float a_data[] = {0, 1, 2}; + nntrainer::Tensor a(nntrainer::TensorDim(1, 1, 1, 3), a_data); + float b_data[] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11}; + nntrainer::Tensor b(nntrainer::TensorDim(1, 1, 4, 3), b_data); + float answer_data[] = {20, 23, 26, 29}; + nntrainer::Tensor answer(nntrainer::TensorDim(1, 1, 1, 4), answer_data); + nntrainer::Tensor ret = a.dot(b, false, true); + EXPECT_EQ(ret, answer); + } + { + float a_data[] = {0, 1, 2}; + nntrainer::Tensor a(nntrainer::TensorDim(1, 1, 3, 1), a_data); + float b_data[] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11}; + nntrainer::Tensor b(nntrainer::TensorDim(1, 1, 4, 3), b_data); + float answer_data[] = {20, 23, 26, 29}; + nntrainer::Tensor answer(nntrainer::TensorDim(1, 1, 1, 4), answer_data); + nntrainer::Tensor ret = a.dot(b, true, true); + EXPECT_EQ(ret, answer); + } + { + float a_data[] = {0, 1, 2}; + nntrainer::Tensor a(nntrainer::TensorDim(1, 1, 1, 3), a_data); + float b_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + nntrainer::Tensor b(nntrainer::TensorDim(1, 1, 3, 4), b_data); + float answer_data[] = {20, 23, 26, 29}; + nntrainer::Tensor answer(nntrainer::TensorDim(1, 1, 1, 4), answer_data); + nntrainer::Tensor 
ret = a.dot(b, false, false); + EXPECT_EQ(ret, answer); + } + { + float a_data[] = {0, 1, 2}; + nntrainer::Tensor a(nntrainer::TensorDim(1, 1, 3, 1), a_data); + float b_data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + nntrainer::Tensor b(nntrainer::TensorDim(1, 1, 3, 4), b_data); + float answer_data[] = {20, 23, 26, 29}; + nntrainer::Tensor answer(nntrainer::TensorDim(1, 1, 1, 4), answer_data); + nntrainer::Tensor ret = a.dot(b, true, false); + EXPECT_EQ(ret, answer); + } + { + float a_data[] = {0, 1, 2}; + nntrainer::Tensor a(nntrainer::TensorDim(1, 1, 1, 3), a_data); + float b_data[] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11}; + nntrainer::Tensor b(nntrainer::TensorDim(1, 1, 4, 3), b_data); + float answer_data[] = {20, 23, 26, 29}; + nntrainer::Tensor answer(nntrainer::TensorDim(1, 1, 1, 4), answer_data); + nntrainer::Tensor ret = a.dot(b, false, true); + EXPECT_EQ(ret, answer); + } + { + float a_data[] = {0, 1, 2}; + nntrainer::Tensor a(nntrainer::TensorDim(1, 1, 3, 1), a_data); + float b_data[] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11}; + nntrainer::Tensor b(nntrainer::TensorDim(1, 1, 4, 3), b_data); + float answer_data[] = {20, 23, 26, 29}; + nntrainer::Tensor answer(nntrainer::TensorDim(1, 1, 1, 4), answer_data); + nntrainer::Tensor ret = a.dot(b, true, true); + EXPECT_EQ(ret, answer); + } + { + float a_data[] = {0, 1, 2}; + nntrainer::Tensor a(nntrainer::TensorDim(1, 1, 1, 3), a_data); + float b_data[] = {0, 1, 2, 3, 4, 5}; + nntrainer::Tensor b(nntrainer::TensorDim(1, 1, 3, 2), b_data); + float answer_data[] = {10, 13}; + nntrainer::Tensor answer(nntrainer::TensorDim(1, 1, 1, 2), answer_data); + nntrainer::Tensor ret = a.dot(b, false, false); + EXPECT_EQ(ret, answer); + } + { + float a_data[] = {0, 1, 2}; + nntrainer::Tensor a(nntrainer::TensorDim(1, 1, 3, 1), a_data); + float b_data[] = {0, 1, 2, 3, 4, 5}; + nntrainer::Tensor b(nntrainer::TensorDim(1, 1, 3, 2), b_data); + float answer_data[] = {10, 13}; + nntrainer::Tensor answer(nntrainer::TensorDim(1, 1, 
1, 2), answer_data); + nntrainer::Tensor ret = a.dot(b, true, false); + EXPECT_EQ(ret, answer); + } + { + float a_data[] = {0, 1, 2}; + nntrainer::Tensor a(nntrainer::TensorDim(1, 1, 1, 3), a_data); + float b_data[] = {0, 2, 4, 1, 3, 5}; + nntrainer::Tensor b(nntrainer::TensorDim(1, 1, 2, 3), b_data); + float answer_data[] = {10, 13}; + nntrainer::Tensor answer(nntrainer::TensorDim(1, 1, 1, 2), answer_data); + nntrainer::Tensor ret = a.dot(b, false, true); + EXPECT_EQ(ret, answer); + } + { + float a_data[] = {0, 1, 2}; + nntrainer::Tensor a(nntrainer::TensorDim(1, 1, 3, 1), a_data); + float b_data[] = {0, 2, 4, 1, 3, 5}; + nntrainer::Tensor b(nntrainer::TensorDim(1, 1, 2, 3), b_data); + float answer_data[] = {10, 13}; + nntrainer::Tensor answer(nntrainer::TensorDim(1, 1, 1, 2), answer_data); + nntrainer::Tensor ret = a.dot(b, true, true); + EXPECT_EQ(ret, answer); + } +} + TEST(nntrainer_Tensor, transpose_01_p) { int status = ML_ERROR_NONE; int batch = 3; -- 2.7.4