From a2d0cc878cb605c771cf7ca2979dcf9974ea5897 Mon Sep 17 00:00:00 2001 From: Vladislav Sovrasov Date: Fri, 3 Jun 2016 10:38:30 +0300 Subject: [PATCH] Implement internal HAL for GEMM and matrix decompositions --- CMakeLists.txt | 5 + cmake/OpenCVFindLibsPerf.cmake | 13 + cmake/templates/cvconfig.h.in | 3 + modules/core/include/opencv2/core/hal/hal.hpp | 26 +- modules/core/include/opencv2/core/hal/interface.h | 15 + modules/core/include/opencv2/core/matx.hpp | 2 +- modules/core/src/hal_internal.cpp | 485 ++++++++++++++++++++++ modules/core/src/hal_internal.hpp | 96 +++++ modules/core/src/hal_replacement.hpp | 134 ++++++ modules/core/src/lapack.cpp | 37 +- modules/core/src/matmul.cpp | 248 ++++++++--- modules/core/src/matrix_decomp.cpp | 19 +- modules/imgproc/src/hal_replacement.hpp | 19 + 13 files changed, 1027 insertions(+), 75 deletions(-) create mode 100644 modules/core/src/hal_internal.cpp create mode 100644 modules/core/src/hal_internal.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index f0085b6..e5453c3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -221,6 +221,7 @@ OCV_OPTION(WITH_VA "Include VA support" OFF OCV_OPTION(WITH_VA_INTEL "Include Intel VA-API/OpenCL support" OFF IF (UNIX AND NOT ANDROID) ) OCV_OPTION(WITH_GDAL "Include GDAL Support" OFF IF (NOT ANDROID AND NOT IOS AND NOT WINRT) ) OCV_OPTION(WITH_GPHOTO2 "Include gPhoto2 library support" ON IF (UNIX AND NOT ANDROID) ) +OCV_OPTION(WITH_LAPACK "Include Lapack library support" ON IF (UNIX AND NOT ANDROID) ) # OpenCV build components # =================================================== @@ -1157,6 +1158,10 @@ if(DEFINED WITH_VA_INTEL) status(" Use Intel VA-API/OpenCL:" HAVE_VA_INTEL THEN "YES (MSDK: ${VA_INTEL_MSDK_ROOT} OpenCL: ${VA_INTEL_IOCL_ROOT})" ELSE NO) endif(DEFINED WITH_VA_INTEL) +if(DEFINED WITH_LAPACK) +status(" Use Lapack:" HAVE_LAPACK THEN "YES" ELSE NO) +endif(DEFINED WITH_LAPACK) + status(" Use Eigen:" HAVE_EIGEN THEN "YES (ver ${EIGEN_WORLD_VERSION}.${EIGEN_MAJOR_VERSION}.${EIGEN_MINOR_VERSION})" ELSE NO) status(" Use Cuda:" HAVE_CUDA THEN "YES (ver ${CUDA_VERSION_STRING})" ELSE NO) status(" Use OpenCL:" HAVE_OPENCL THEN YES ELSE NO) diff --git a/cmake/OpenCVFindLibsPerf.cmake b/cmake/OpenCVFindLibsPerf.cmake index d1bc541..59ee42d 100644 --- a/cmake/OpenCVFindLibsPerf.cmake +++ b/cmake/OpenCVFindLibsPerf.cmake @@ -2,6 +2,19 @@ # Detect other 3rd-party performance and math libraries # ---------------------------------------------------------------------------- +# --- Lapack --- +if(WITH_LAPACK) + find_package(LAPACK) + if(LAPACK_FOUND) + find_path(LAPACK_INCLUDE_DIR "lapacke.h") + if(LAPACK_INCLUDE_DIR) + set(HAVE_LAPACK 1) + ocv_include_directories(${LAPACK_INCLUDE_DIR}) + list(APPEND OPENCV_LINKER_LIBS ${LAPACK_LIBRARIES}) + endif() + endif(LAPACK_FOUND) +endif(WITH_LAPACK) + # --- TBB --- if(WITH_TBB) include("${OpenCV_SOURCE_DIR}/cmake/OpenCVDetectTBB.cmake") diff --git a/cmake/templates/cvconfig.h.in b/cmake/templates/cvconfig.h.in index c80720d..b86d44a 100644 --- a/cmake/templates/cvconfig.h.in +++ b/cmake/templates/cvconfig.h.in @@ -197,3 +197,6 @@ /* Intel VA-API/OpenCL */ #cmakedefine HAVE_VA_INTEL + +/* Lapack */ +#cmakedefine HAVE_LAPACK diff --git a/modules/core/include/opencv2/core/hal/hal.hpp b/modules/core/include/opencv2/core/hal/hal.hpp index 09bcd72..3829418 100644 --- a/modules/core/include/opencv2/core/hal/hal.hpp +++ b/modules/core/include/opencv2/core/hal/hal.hpp @@ -49,17 +49,6 @@ #include "opencv2/core/cvstd.hpp" #include "opencv2/core/hal/interface.h" -//! @cond IGNORED -#define CALL_HAL(name, fun, ...) \ - int res = fun(__VA_ARGS__); \ - if (res == CV_HAL_ERROR_OK) \ - return; \ - else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED) \ - CV_Error_(cv::Error::StsInternal, \ - ("HAL implementation " CVAUX_STR(name) " ==> " CVAUX_STR(fun) " returned %d (0x%08x)", res, res)); -//! @endcond - - namespace cv { namespace hal { //! @addtogroup core_hal_functions @@ -75,6 +64,21 @@ CV_EXPORTS int LU32f(float* A, size_t astep, int m, float* b, size_t bstep, int CV_EXPORTS int LU64f(double* A, size_t astep, int m, double* b, size_t bstep, int n); CV_EXPORTS bool Cholesky32f(float* A, size_t astep, int m, float* b, size_t bstep, int n); CV_EXPORTS bool Cholesky64f(double* A, size_t astep, int m, double* b, size_t bstep, int n); +CV_EXPORTS void SVD32f(float* At, size_t astep, float* W, float* U, size_t ustep, float* Vt, size_t vstep, int m, int n, int flags); +CV_EXPORTS void SVD64f(double* At, size_t astep, double* W, double* U, size_t ustep, double* Vt, size_t vstep, int m, int n, int flags); + +CV_EXPORTS void gemm32f(const float* src1, size_t src1_step, const float* src2, size_t src2_step, + float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step, + int m_a, int n_a, int n_d, int flags); +CV_EXPORTS void gemm64f(const double* src1, size_t src1_step, const double* src2, size_t src2_step, + double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step, + int m_a, int n_a, int n_d, int flags); +CV_EXPORTS void gemm32fc(const float* src1, size_t src1_step, const float* src2, size_t src2_step, + float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step, + int m_a, int n_a, int n_d, int flags); +CV_EXPORTS void gemm64fc(const double* src1, size_t src1_step, const double* src2, size_t src2_step, + double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step, + int m_a, int n_a, int n_d, int flags); CV_EXPORTS int normL1_(const uchar* a, const uchar* b, int n); CV_EXPORTS float normL1_(const float* a, const float* b, int n); diff --git a/modules/core/include/opencv2/core/hal/interface.h b/modules/core/include/opencv2/core/hal/interface.h index 2bb7b19..4a97e65 100644 --- a/modules/core/include/opencv2/core/hal/interface.h +++ b/modules/core/include/opencv2/core/hal/interface.h @@ -158,6 +158,21 @@ typedef signed char schar; #define CV_HAL_DFT_IS_INPLACE 1024 //! @} +//! @name SVD flags +//! @{ +#define CV_HAL_SVD_NO_UV 1 +#define CV_HAL_SVD_SHORT_UV 2 +#define CV_HAL_SVD_MODIFY_A 4 +#define CV_HAL_SVD_FULL_UV 8 +//! @} + +//! @name Gemm flags +//! @{ +#define CV_HAL_GEMM_1_T 1 +#define CV_HAL_GEMM_2_T 2 +#define CV_HAL_GEMM_3_T 4 +//! @} + //! @} #endif diff --git a/modules/core/include/opencv2/core/matx.hpp b/modules/core/include/opencv2/core/matx.hpp index e4d72f7..a4c0fbb 100644 --- a/modules/core/include/opencv2/core/matx.hpp +++ b/modules/core/include/opencv2/core/matx.hpp @@ -438,7 +438,7 @@ template struct Matx_DetOp return p; for( int i = 0; i < m; i++ ) p *= temp(i, i); - return 1./p; + return p; } }; diff --git a/modules/core/src/hal_internal.cpp b/modules/core/src/hal_internal.cpp new file mode 100644 index 0000000..096ac0b --- /dev/null +++ b/modules/core/src/hal_internal.cpp @@ -0,0 +1,485 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "hal_internal.hpp" + +#ifdef HAVE_LAPACK + +#include +#include +#include +#include +#include +#include +#include + +#define HAL_GEMM_SMALL_COMPLEX_MATRIX_THRESH 100 +#define HAL_GEMM_SMALL_MATRIX_THRESH 100 +#define HAL_SVD_SMALL_MATRIX_THRESH 25 +#define HAL_LU_SMALL_MATRIX_THRESH 100 +#define HAL_CHOLESKY_SMALL_MATRIX_THRESH 100 + +//lapack stores matrices in column-major order so transposing is neded everywhere +template static inline void +transpose_square_inplace(fptype *src, size_t src_ld, size_t m) +{ + for(size_t i = 0; i < m - 1; i++) + for(size_t j = i + 1; j < m; j++) + std::swap(src[j*src_ld + i], src[i*src_ld + j]); +} + +template static inline void +transpose(const fptype *src, size_t src_ld, fptype* dst, size_t dst_ld, size_t m, size_t n) +{ + for(size_t i = 0; i < m; i++) + for(size_t j = 0; j < n; j++) + dst[j*dst_ld + i] = src[i*src_ld + j]; +} + +template static inline void +copy_matrix(const fptype *src, size_t src_ld, fptype* dst, size_t dst_ld, size_t m, size_t n) +{ + for(size_t i = 0; i < m; i++) + for(size_t j = 0; j < n; j++) + dst[i*dst_ld + j] = src[i*src_ld + j]; +} + +template static inline void +set_value(fptype *dst, size_t dst_ld, fptype value, size_t m, size_t n) +{ + for(size_t i = 0; i < m; i++) + for(size_t j = 0; j < n; j++) + dst[i*dst_ld + j] = value; +} + +template static inline int +lapack_LU(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n, int* info) +{ + int lda = a_step / sizeof(fptype), sign = 0; + int* piv = new int[m]; + + transpose_square_inplace(a, lda, m); + + if(b) + { + if(n == 1 && b_step == sizeof(fptype)) + { + if(typeid(fptype) == typeid(float)) + sgesv_(&m, &n, (float*)a, &lda, piv, (float*)b, &m, info); + else if(typeid(fptype) == typeid(double)) + dgesv_(&m, &n, (double*)a, &lda, piv, (double*)b, &m, info); + } + else + { + int ldb = b_step / sizeof(fptype); + fptype* tmpB = new fptype[m*n]; + + transpose(b, ldb, tmpB, m, m, n); + + if(typeid(fptype) == typeid(float)) + sgesv_(&m, &n, (float*)a, &lda, piv, (float*)tmpB, &m, info); + else if(typeid(fptype) == typeid(double)) + dgesv_(&m, &n, (double*)a, &lda, piv, (double*)tmpB, &m, info); + + transpose(tmpB, m, b, ldb, n, m); + delete[] tmpB; + } + } + else + { + if(typeid(fptype) == typeid(float)) + sgetrf_(&m, &m, (float*)a, &lda, piv, info); + else if(typeid(fptype) == typeid(double)) + dgetrf_(&m, &m, (double*)a, &lda, piv, info); + } + + if(*info == 0) + { + for(int i = 0; i < m; i++) + sign ^= piv[i] != i + 1; + *info = sign ? -1 : 1; + } + else + *info = 0; //in opencv LU function zero means error + + delete[] piv; + return CV_HAL_ERROR_OK; +} + +template static inline int +lapack_Cholesky(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n, bool* info) +{ + int lapackStatus; + int lda = a_step / sizeof(fptype); + char L[] = {'L', '\0'}; + + if(b) + { + if(n == 1 && b_step == sizeof(fptype)) + { + if(typeid(fptype) == typeid(float)) + sposv_(L, &m, &n, (float*)a, &lda, (float*)b, &m, &lapackStatus); + else if(typeid(fptype) == typeid(double)) + dposv_(L, &m, &n, (double*)a, &lda, (double*)b, &m, &lapackStatus); + } + else + { + int ldb = b_step / sizeof(fptype); + fptype* tmpB = new fptype[m*n]; + transpose(b, ldb, tmpB, m, m, n); + + if(typeid(fptype) == typeid(float)) + sposv_(L, &m, &n, (float*)a, &lda, (float*)tmpB, &m, &lapackStatus); + else if(typeid(fptype) == typeid(double)) + dposv_(L, &m, &n, (double*)a, &lda, (double*)tmpB, &m, &lapackStatus); + + transpose(tmpB, m, b, ldb, n, m); + delete[] tmpB; + } + } + else + { + if(typeid(fptype) == typeid(float)) + spotrf_(L, &m, (float*)a, &lda, &lapackStatus); + else if(typeid(fptype) == typeid(double)) + dpotrf_(L, &m, (double*)a, &lda, &lapackStatus); + } + + if(lapackStatus == 0) *info = true; + else *info = false; //in opencv Cholesky function false means error + + return CV_HAL_ERROR_OK; +} + +template static inline int +lapack_SVD(fptype* a, size_t a_step, fptype *w, fptype* u, size_t u_step, fptype* vt, size_t v_step, int m, int n, int flags, int* info) +{ + int lda = a_step / sizeof(fptype); + int ldv = v_step / sizeof(fptype); + int ldu = u_step / sizeof(fptype); + int lwork = -1; + int* iworkBuf = new int[8*std::min(m, n)]; + fptype work1 = 0; + + //A already transposed and m>=n + char mode[] = { ' ', '\0'}; + if(flags & CV_HAL_SVD_NO_UV) + { + ldv = 1; + mode[0] = 'N'; + } + else if((flags & CV_HAL_SVD_SHORT_UV) && (flags & CV_HAL_SVD_MODIFY_A)) //short SVD, U stored in a + mode[0] = 'O'; + else if((flags & CV_HAL_SVD_SHORT_UV) && !(flags & CV_HAL_SVD_MODIFY_A)) //short SVD, U stored in u if m>=n + mode[0] = 'S'; + else if(flags & CV_HAL_SVD_FULL_UV) //full SVD, U stored in u or in a + mode[0] = 'A'; + + if((flags & CV_HAL_SVD_MODIFY_A) && (flags & CV_HAL_SVD_FULL_UV)) //U stored in a + { + u = new fptype[m*m]; + ldu = m; + } + + if(typeid(fptype) == typeid(float)) + sgesdd_(mode, &m, &n, (float*)a, &lda, (float*)w, (float*)u, &ldu, (float*)vt, &ldv, (float*)&work1, &lwork, iworkBuf, info); + else if(typeid(fptype) == typeid(double)) + dgesdd_(mode, &m, &n, (double*)a, &lda, (double*)w, (double*)u, &ldu, (double*)vt, &ldv, (double*)&work1, &lwork, iworkBuf, info); + + lwork = round(work1); //optimal buffer size + fptype* buffer = new fptype[lwork + 1]; + + if(typeid(fptype) == typeid(float)) + sgesdd_(mode, &m, &n, (float*)a, &lda, (float*)w, (float*)u, &ldu, (float*)vt, &ldv, (float*)buffer, &lwork, iworkBuf, info); + else if(typeid(fptype) == typeid(double)) + dgesdd_(mode, &m, &n, (double*)a, &lda, (double*)w, (double*)u, &ldu, (double*)vt, &ldv, (double*)buffer, &lwork, iworkBuf, info); + + if(!(flags & CV_HAL_SVD_NO_UV)) + transpose_square_inplace(vt, ldv, n); + + if((flags & CV_HAL_SVD_MODIFY_A) && (flags & CV_HAL_SVD_FULL_UV)) + { + for(int i = 0; i < m; i++) + for(int j = 0; j < m; j++) + a[i*lda + j] = u[i*m + j]; + delete[] u; + } + + delete[] iworkBuf; + delete[] buffer; + return CV_HAL_ERROR_OK; +} + +template static inline int +lapack_gemm(const fptype *src1, size_t src1_step, const fptype *src2, size_t src2_step, fptype alpha, + const fptype *src3, size_t src3_step, fptype beta, fptype *dst, size_t dst_step, int a_m, int a_n, int d_n, int flags) +{ + int ldsrc1 = src1_step / sizeof(fptype); + int ldsrc2 = src2_step / sizeof(fptype); + int ldsrc3 = src3_step / sizeof(fptype); + int lddst = dst_step / sizeof(fptype); + int c_m, c_n, d_m; + CBLAS_TRANSPOSE transA, transB; + + if(flags & CV_HAL_GEMM_2_T) + { + transB = CblasTrans; + if(flags & CV_HAL_GEMM_1_T ) + { + d_m = a_n; + } + else + { + d_m = a_m; + } + } + else + { + transB = CblasNoTrans; + if(flags & CV_HAL_GEMM_1_T ) + { + d_m = a_n; + } + else + { + d_m = a_m; + } + } + + if(flags & CV_HAL_GEMM_3_T) + { + c_m = d_n; + c_n = d_m; + } + else + { + c_m = d_m; + c_n = d_n; + } + + if(flags & CV_HAL_GEMM_1_T ) + { + transA = CblasTrans; + std::swap(a_n, a_m); + } + else + { + transA = CblasNoTrans; + } + + if(src3 != dst && beta != 0.0 && src3_step != 0) { + if(flags & CV_HAL_GEMM_3_T) + transpose(src3, ldsrc3, dst, lddst, c_m, c_n); + else + copy_matrix(src3, ldsrc3, dst, lddst, c_m, c_n); + } + else if (src3 == dst && (flags & CV_HAL_GEMM_3_T)) //actually transposing C in this case done by openCV + return CV_HAL_ERROR_NOT_IMPLEMENTED; + else if(src3_step == 0 && beta != 0.0) + set_value(dst, lddst, (fptype)0.0, d_m, d_n); + + if(typeid(fptype) == typeid(float)) + cblas_sgemm(CblasRowMajor, transA, transB, a_m, d_n, a_n, (float)alpha, (float*)src1, ldsrc1, (float*)src2, ldsrc2, (float)beta, (float*)dst, lddst); + else if(typeid(fptype) == typeid(double)) + cblas_dgemm(CblasRowMajor, transA, transB, a_m, d_n, a_n, (double)alpha, (double*)src1, ldsrc1, (double*)src2, ldsrc2, (double)beta, (double*)dst, lddst); + + return CV_HAL_ERROR_OK; +} + + +template static inline int +lapack_gemm_c(const fptype *src1, size_t src1_step, const fptype *src2, size_t src2_step, fptype alpha, + const fptype *src3, size_t src3_step, fptype beta, fptype *dst, size_t dst_step, int a_m, int a_n, int d_n, int flags) +{ + int ldsrc1 = src1_step / sizeof(std::complex); + int ldsrc2 = src2_step / sizeof(std::complex); + int ldsrc3 = src3_step / sizeof(std::complex); + int lddst = dst_step / sizeof(std::complex); + int c_m, c_n, d_m; + CBLAS_TRANSPOSE transA, transB; + std::complex cAlpha(alpha, 0.0); + std::complex cBeta(beta, 0.0); + + if(flags & CV_HAL_GEMM_2_T) + { + transB = CblasTrans; + if(flags & CV_HAL_GEMM_1_T ) + { + d_m = a_n; + } + else + { + d_m = a_m; + } + } + else + { + transB = CblasNoTrans; + if(flags & CV_HAL_GEMM_1_T ) + { + d_m = a_n; + } + else + { + d_m = a_m; + } + } + + if(flags & CV_HAL_GEMM_3_T) + { + c_m = d_n; + c_n = d_m; + } + else + { + c_m = d_m; + c_n = d_n; + } + + if(flags & CV_HAL_GEMM_1_T ) + { + transA = CblasTrans; + std::swap(a_n, a_m); + } + else + { + transA = CblasNoTrans; + } + + if(src3 != dst && beta != 0.0 && src3_step != 0) { + if(flags & CV_HAL_GEMM_3_T) + transpose((std::complex*)src3, ldsrc3, (std::complex*)dst, lddst, c_m, c_n); + else + copy_matrix((std::complex*)src3, ldsrc3, (std::complex*)dst, lddst, c_m, c_n); + } + else if (src3 == dst && (flags & CV_HAL_GEMM_3_T)) //actually transposing C in this case done by openCV + return CV_HAL_ERROR_NOT_IMPLEMENTED; + else if(src3_step == 0 && beta != 0.0) + set_value((std::complex*)dst, lddst, std::complex(0.0, 0.0), d_m, d_n); + + if(typeid(fptype) == typeid(float)) + cblas_cgemm(CblasRowMajor, transA, transB, a_m, d_n, a_n, &cAlpha, (void*)src1, ldsrc1, (void*)src2, ldsrc2, &cBeta, (void*)dst, lddst); + else if(typeid(fptype) == typeid(double)) + cblas_zgemm(CblasRowMajor, transA, transB, a_m, d_n, a_n, &cAlpha, (void*)src1, ldsrc1, (void*)src2, ldsrc2, &cBeta, (void*)dst, lddst); + + return CV_HAL_ERROR_OK; +} +int lapack_LU32f(float* a, size_t a_step, int m, float* b, size_t b_step, int n, int* info) +{ + if(m < HAL_LU_SMALL_MATRIX_THRESH) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + return lapack_LU(a, a_step, m, b, b_step, n, info); +} + +int lapack_LU64f(double* a, size_t a_step, int m, double* b, size_t b_step, int n, int* info) +{ + if(m < HAL_LU_SMALL_MATRIX_THRESH) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + return lapack_LU(a, a_step, m, b, b_step, n, info); +} + +int lapack_Cholesky32f(float* a, size_t a_step, int m, float* b, size_t b_step, int n, bool *info) +{ + if(m < HAL_CHOLESKY_SMALL_MATRIX_THRESH) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + return lapack_Cholesky(a, a_step, m, b, b_step, n, info); +} + +int lapack_Cholesky64f(double* a, size_t a_step, int m, double* b, size_t b_step, int n, bool *info) +{ + if(m < HAL_CHOLESKY_SMALL_MATRIX_THRESH) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + return lapack_Cholesky(a, a_step, m, b, b_step, n, info); +} + +int lapack_SVD32f(float* a, size_t a_step, float *w, float* u, size_t u_step, float* vt, size_t v_step, int m, int n, int flags) +{ + + if(m < HAL_SVD_SMALL_MATRIX_THRESH) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + int info; + return lapack_SVD(a, a_step, w, u, u_step, vt, v_step, m, n, flags, &info); +} + +int lapack_SVD64f(double* a, size_t a_step, double *w, double* u, size_t u_step, double* vt, size_t v_step, int m, int n, int flags) +{ + + if(m < HAL_SVD_SMALL_MATRIX_THRESH) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + int info; + return lapack_SVD(a, a_step, w, u, u_step, vt, v_step, m, n, flags, &info); +} + +int lapack_gemm32f(const float *src1, size_t src1_step, const float *src2, size_t src2_step, float alpha, + const float *src3, size_t src3_step, float beta, float *dst, size_t dst_step, int m, int n, int k, int flags) +{ + if(m < HAL_GEMM_SMALL_MATRIX_THRESH) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + return lapack_gemm(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m, n, k, flags); +} + +int lapack_gemm64f(const double *src1, size_t src1_step, const double *src2, size_t src2_step, double alpha, + const double *src3, size_t src3_step, double beta, double *dst, size_t dst_step, int m, int n, int k, int flags) +{ + if(m < HAL_GEMM_SMALL_MATRIX_THRESH) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + return lapack_gemm(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m, n, k, flags); +} + +int lapack_gemm32fc(const float *src1, size_t src1_step, const float *src2, size_t src2_step, float alpha, + const float *src3, size_t src3_step, float beta, float *dst, size_t dst_step, int m, int n, int k, int flags) +{ + if(m < HAL_GEMM_SMALL_COMPLEX_MATRIX_THRESH) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + return lapack_gemm_c(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m, n, k, flags); +} +int lapack_gemm64fc(const double *src1, size_t src1_step, const double *src2, size_t src2_step, double alpha, + const double *src3, size_t src3_step, double beta, double *dst, size_t dst_step, int m, int n, int k, int flags) +{ + if(m < HAL_GEMM_SMALL_COMPLEX_MATRIX_THRESH) + return CV_HAL_ERROR_NOT_IMPLEMENTED; + return lapack_gemm_c(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m, n, k, flags); +} + +#endif //HAVE_LAPACK diff --git a/modules/core/src/hal_internal.hpp b/modules/core/src/hal_internal.hpp new file mode 100644 index 0000000..26bc142 --- /dev/null +++ b/modules/core/src/hal_internal.hpp @@ -0,0 +1,96 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Copyright (C) 2015, Itseez Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef OPENCV_CORE_HAL_INTERNAL_HPP +#define OPENCV_CORE_HAL_INTERNAL_HPP + +#include "precomp.hpp" + +#ifdef HAVE_LAPACK + +int lapack_LU32f(float* a, size_t a_step, int m, float* b, size_t b_step, int n, int* info); +int lapack_LU64f(double* a, size_t a_step, int m, double* b, size_t b_step, int n, int* info); +int lapack_Cholesky32f(float* a, size_t a_step, int m, float* b, size_t b_step, int n, bool* info); +int lapack_Cholesky64f(double* a, size_t a_step, int m, double* b, size_t b_step, int n, bool* info); +int lapack_SVD32f(float* a, size_t a_step, float* w, float* u, size_t u_step, float* vt, size_t v_step, int m, int n, int flags); +int lapack_SVD64f(double* a, size_t a_step, double* w, double* u, size_t u_step, double* vt, size_t v_step, int m, int n, int flags); +int lapack_gemm32f(const float* src1, size_t src1_step, const float* src2, size_t src2_step, + float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step, + int m, int n, int k, int flags); +int lapack_gemm64f(const double* src1, size_t src1_step, const double* src2, size_t src2_step, + double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step, + int m, int n, int k, int flags); +int lapack_gemm32fc(const float* src1, size_t src1_step, const float* src2, size_t src2_step, + float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step, + int m, int n, int k, int flags); +int lapack_gemm64fc(const double* src1, size_t src1_step, const double* src2, size_t src2_step, + double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step, + int m, int n, int k, int flags); + +#undef cv_hal_LU32f +#define cv_hal_LU32f lapack_LU32f +#undef cv_hal_LU64f +#define cv_hal_LU64f lapack_LU64f + +#undef cv_hal_Cholesky32f +#define cv_hal_Cholesky32f lapack_Cholesky32f +#undef cv_hal_Cholesky64f +#define cv_hal_Cholesky64f lapack_Cholesky64f + +#undef cv_hal_SVD32f +#define cv_hal_SVD32f lapack_SVD32f +#undef cv_hal_SVD64f +#define cv_hal_SVD64f lapack_SVD64f + +#undef cv_hal_gemm32f +#define cv_hal_gemm32f lapack_gemm32f +#undef cv_hal_gemm64f +#define cv_hal_gemm64f lapack_gemm64f +#undef cv_hal_gemm32fc +#define cv_hal_gemm32fc lapack_gemm32fc +#undef cv_hal_gemm64fc +#define cv_hal_gemm64fc lapack_gemm64fc + +#endif //HAVE_LAPACK +#endif //OPENCV_CORE_HAL_INTERNAL_HPP diff --git a/modules/core/src/hal_replacement.hpp b/modules/core/src/hal_replacement.hpp index 93476c4..605ae38 100644 --- a/modules/core/src/hal_replacement.hpp +++ b/modules/core/src/hal_replacement.hpp @@ -472,14 +472,148 @@ inline int hal_ni_dctFree2D(cvhalDFT *context) { return CV_HAL_ERROR_NOT_IMPLEME #define cv_hal_dctFree2D hal_ni_dctFree2D //! @endcond + +/** +Performs \f$LU\f$ decomposition of square matrix \f$A=P*L*U\f$ (where \f$P\f$ is permutation matrix) and solves matrix equation \f$A*X=B\f$. +Function returns the \f$sign\f$ of permutation \f$P\f$ via parameter info. +@param src1 pointer to input matrix \f$A\f$ stored in row major order. After finish of work src1 contains at least \f$U\f$ part of \f$LU\f$ +decomposition which is appropriate for determainant calculation: \f$det(A)=sign*\prod_{j=1}^{M}a_{jj}\f$. +@param src1_step number of bytes each matrix \f$A\f$ row occupies. +@param m size of square matrix \f$A\f$. +@param src2 pointer to \f$M\times N\f$ matrix \f$B\f$ which is the right-hand side of system \f$A*X=B\f$. \f$B\f$ stored in row major order. +If src2 is null pointer only \f$LU\f$ decomposition will be performed. After finish of work src2 contains solution \f$X\f$ of system \f$A*X=B\f$. +@param src2_step number of bytes each matrix \f$B\f$ row occupies. +@param n number of right-hand vectors in \f$M\times N\f$ matrix \f$B\f$. +@param info indicates success of decomposition. If *info is equals to zero decomposition failed, othervise *info is equals to \f$sign\f$. + */ +//! @addtogroup core_hal_interface_decomp_lu LU matrix decomposition +//! @{ +inline int hal_ni_LU32f(float* src1, size_t src1_step, int m, float* src2, size_t src2_step, int n, int* info) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } +inline int hal_ni_LU64f(double* src1, size_t src1_step, int m, double* src2, size_t src2_step, int n, int* info) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } +//! @} + +/** +Performs Cholesky decomposition of matrix \f$A = L*L^T\f$ and solves matrix equation \f$A*X=B\f$. +@param src1 pointer to input matrix \f$A\f$ stored in row major order. After finish of work src1 contains lower triangular matrix \f$L\f$. +@param src1_step number of bytes each matrix \f$A\f$ row occupies. +@param m size of square matrix \f$A\f$. +@param src2 pointer to \f$M\times N\f$ matrix \f$B\f$ which is the right-hand side of system \f$A*X=B\f$. B stored in row major order. +If src2 is null pointer only Cholesky decomposition will be performed. After finish of work src2 contains solution \f$X\f$ of system \f$A*X=B\f$. +@param src2_step number of bytes each matrix \f$B\f$ row occupies. +@param n number of right-hand vectors in \f$M\times N\f$ matrix \f$B\f$. +@param info indicates success of decomposition. If *info is false decomposition failed. + */ + +//! @addtogroup core_hal_interface_decomp_cholesky Cholesky matrix decomposition +//! @{ +inline int hal_ni_Cholesky32f(float* src1, size_t src1_step, int m, float* src2, size_t src2_step, int n, bool* info) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } +inline int hal_ni_Cholesky64f(double* src1, size_t src1_step, int m, double* src2, size_t src2_step, int n, bool* info) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } +//! @} + +/** +Performs singular value decomposition of \f$M\times N\f$(\f$M>N\f$) matrix \f$A = U*\Sigma*V^T\f$. +@param src pointer to input \f$M\times N\f$ matrix \f$A\f$ stored in column major order. +After finish of work src will be filled with rows of \f$U\f$ or not modified (depends of flag CV_HAL_SVD_MODIFY_A). +@param src_step number of bytes each matrix \f$A\f$ column occupies. +@param w pointer to array for singular values of matrix \f$A\f$ (i. e. first \f$N\f$ diagonal elements of matrix \f$\Sigma\f$). +@param u pointer to output \f$M\times N\f$ or \f$M\times M\f$ matrix \f$U\f$ (size depends of flags). Pointer must be valid if flag CV_HAL_SVD_MODIFY_A not used. +@param u_step number of bytes each matrix \f$U\f$ row occupies. +@param vt pointer to array for \f$N\times N\f$ matrix \f$V^T\f$. +@param vt_step number of bytes each matrix \f$V^T\f$ row occupies. +@param m number fo rows in matrix \f$A\f$. +@param n number of columns in matrix \f$A\f$. +@param flags algorithm options (combination of CV_HAL_SVD_FULL_UV, ...). + */ +//! @addtogroup core_hal_interface_decomp_svd Singular value matrix decomposition +//! @{ +inline int hal_ni_SVD32f(float* src, size_t src_step, float* w, float* u, size_t u_step, float* vt, size_t vt_step, int m, int n, int flags) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } +inline int hal_ni_SVD64f(double* src, size_t src_step, double* w, double* u, size_t u_step, double* vt, size_t vt_step, int m, int n, int flags) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } +//! @} + + + +//! @cond IGNORED +#define cv_hal_LU32f hal_ni_LU32f +#define cv_hal_LU64f hal_ni_LU64f +#define cv_hal_Cholesky32f hal_ni_Cholesky32f +#define cv_hal_Cholesky64f hal_ni_Cholesky64f +#define cv_hal_SVD32f hal_ni_SVD32f +#define cv_hal_SVD64f hal_ni_SVD64f +//! @endcond + + +/** +The function performs generalized matrix multiplication similar to the gemm functions in BLAS level 3: +\f$D = \alpha*AB+\beta*C\f$ + +@param src1 pointer to input \f$M\times N\f$ matrix \f$A\f$ or \f$A^T\f$ stored in row major order. +@param src1_step number of bytes each matrix \f$A\f$ or \f$A^T\f$ row occupies. +@param src2 pointer to input \f$N\times K\f$ matrix \f$B\f$ or \f$B^T\f$ stored in row major order. +@param src2_step number of bytes each matrix \f$B\f$ or \f$B^T\f$ row occupies. +@param alpha \f$\alpha\f$ multiplier before \f$AB\f$ +@param src3 pointer to input \f$M\times K\f$ matrix \f$C\f$ or \f$C^T\f$ stored in row major order. +@param src3_step number of bytes each matrix \f$C\f$ or \f$C^T\f$ row occupies. +@param beta \f$\beta\f$ multiplier before \f$C\f$ +@param dst pointer to input \f$M\times K\f$ matrix \f$D\f$ stored in row major order. +@param dst_step number of bytes each matrix \f$D\f$ row occupies. +@param m number of rows in matrix \f$A\f$ or \f$A^T\f$, equals to number of rows in matrix \f$D\f$ +@param n number of columns in matrix \f$A\f$ or \f$A^T\f$ +@param k number of columns in matrix \f$B\f$ or \f$B^T\f$, equals to number of columns in matrix \f$D\f$ +@param flags algorithm options (combination of CV_HAL_GEMM_1_T, ...). + */ + +//! @addtogroup core_hal_interface_matrix_multiplication Matrix multiplication +//! @{ +inline int hal_ni_gemm32f(const float* src1, size_t src1_step, const float* src2, size_t src2_step, + float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step, + int m, int n, int k, int flags) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } +inline int hal_ni_gemm64f(const double* src1, size_t src1_step, const double* src2, size_t src2_step, + double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step, + int m, int n, int k, int flags) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } +inline int hal_ni_gemm32fc(const float* src1, size_t src1_step, const float* src2, size_t src2_step, + float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step, + int m, int n, int k, int flags) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } +inline int hal_ni_gemm64fc(const double* src1, size_t src1_step, const double* src2, size_t src2_step, + double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step, + int m, int n, int k, int flags) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } //! @} +//! @cond IGNORED +#define cv_hal_gemm32f hal_ni_gemm32f +#define cv_hal_gemm64f hal_ni_gemm64f +#define cv_hal_gemm32fc hal_ni_gemm32fc +#define cv_hal_gemm64fc hal_ni_gemm64fc +//! @endcond + +//! @} + + #if defined __GNUC__ # pragma GCC diagnostic pop #elif defined _MSC_VER # pragma warning( pop ) #endif +#include "hal_internal.hpp" #include "custom_hal.hpp" +//! @cond IGNORED +#define CALL_HAL_RET(name, fun, retval, ...) \ + int res = fun(__VA_ARGS__, &retval); \ + if (res == CV_HAL_ERROR_OK) \ + return retval; \ + else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED) \ + CV_Error_(cv::Error::StsInternal, \ + ("HAL implementation " CVAUX_STR(name) " ==> " CVAUX_STR(fun) " returned %d (0x%08x)", res, res)); + + +#define CALL_HAL(name, fun, ...) \ + int res = fun(__VA_ARGS__); \ + if (res == CV_HAL_ERROR_OK) \ + return; \ + else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED) \ + CV_Error_(cv::Error::StsInternal, \ + ("HAL implementation " CVAUX_STR(name) " ==> " CVAUX_STR(fun) " returned %d (0x%08x)", res, res)); +//! @endcond + #endif diff --git a/modules/core/src/lapack.cpp b/modules/core/src/lapack.cpp index 4fcf3c7..3711ca9 100644 --- a/modules/core/src/lapack.cpp +++ b/modules/core/src/lapack.cpp @@ -570,11 +570,44 @@ JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep, static void JacobiSVD(float* At, size_t astep, float* W, float* Vt, size_t vstep, int m, int n, int n1=-1) { - JacobiSVDImpl_(At, astep, W, Vt, vstep, m, n, !Vt ? 0 : n1 < 0 ? n : n1, FLT_MIN, FLT_EPSILON*2); + hal::SVD32f(At, astep, W, NULL, astep, Vt, vstep, m, n, n1); } static void JacobiSVD(double* At, size_t astep, double* W, double* Vt, size_t vstep, int m, int n, int n1=-1) { + hal::SVD64f(At, astep, W, NULL, astep, Vt, vstep, m, n, n1); +} + +template static inline int +decodeSVDParameters(const fptype* U, const fptype* Vt, int m, int n, int n1) +{ + int halSVDFlag = 0; + if(Vt == NULL) + halSVDFlag = CV_HAL_SVD_NO_UV; + else if(n1 <= 0 || n1 == n) + { + halSVDFlag = CV_HAL_SVD_SHORT_UV; + if(U == NULL) + halSVDFlag |= CV_HAL_SVD_MODIFY_A; + } + else if(n1 == m) + { + halSVDFlag = CV_HAL_SVD_FULL_UV; + if(U == NULL) + halSVDFlag |= CV_HAL_SVD_MODIFY_A; + } + return halSVDFlag; +} + +void hal::SVD32f(float* At, size_t astep, float* W, float* U, size_t ustep, float* Vt, size_t vstep, int m, int n, int n1) +{ + CALL_HAL(SVD32f, cv_hal_SVD32f, At, astep, W, U, ustep, Vt, vstep, m, n, decodeSVDParameters(U, Vt, m, n, n1)) + JacobiSVDImpl_(At, astep, W, Vt, vstep, m, n, !Vt ? 0 : n1 < 0 ? n : n1, FLT_MIN, FLT_EPSILON*2); +} + +void hal::SVD64f(double* At, size_t astep, double* W, double* U, size_t ustep, double* Vt, size_t vstep, int m, int n, int n1) +{ + CALL_HAL(SVD64f, cv_hal_SVD64f, At, astep, W, U, ustep, Vt, vstep, m, n, decodeSVDParameters(U, Vt, m, n, n1)) JacobiSVDImpl_(At, astep, W, Vt, vstep, m, n, !Vt ? 0 : n1 < 0 ? n : n1, DBL_MIN, DBL_EPSILON*10); } @@ -745,7 +778,6 @@ double cv::determinant( InputArray _mat ) { for( int i = 0; i < rows; i++ ) result *= a.at(i,i); - result = 1./result; } } } @@ -769,7 +801,6 @@ double cv::determinant( InputArray _mat ) { for( int i = 0; i < rows; i++ ) result *= a.at(i,i); - result = 1./result; } } } diff --git a/modules/core/src/matmul.cpp b/modules/core/src/matmul.cpp index cf83ece..4b0690a 100644 --- a/modules/core/src/matmul.cpp +++ b/modules/core/src/matmul.cpp @@ -864,73 +864,39 @@ static bool ocl_gemm( InputArray matA, InputArray matB, double alpha, return k.run(2, globalsize, block_size!=1 ? localsize : NULL, false); } #endif -} -void cv::gemm( InputArray matA, InputArray matB, double alpha, - InputArray matC, double beta, OutputArray _matD, int flags ) +static void _gemmImplInternal( Mat A, Mat B, double alpha, + Mat C, double beta, Mat D, int flags ) { -#ifdef HAVE_CLAMDBLAS - CV_OCL_RUN(ocl::haveAmdBlas() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2 && _matD.isUMat() && - matA.cols() > 20 && matA.rows() > 20 && matB.cols() > 20, // since it works incorrect for small sizes - ocl_gemm_amdblas(matA, matB, alpha, matC, beta, _matD, flags)) -#endif - -#ifdef HAVE_OPENCL - CV_OCL_RUN(_matD.isUMat() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2, - ocl_gemm(matA, matB, alpha, matC, beta, _matD, flags)) -#endif - const int block_lin_size = 128; const int block_size = block_lin_size * block_lin_size; static double zero[] = {0,0,0,0}; static float zerof[] = {0,0,0,0}; - Mat A = matA.getMat(), B = matB.getMat(), C = beta != 0 ? matC.getMat() : Mat(); Size a_size = A.size(), d_size; int i, len = 0, type = A.type(); - CV_Assert( type == B.type() && (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) ); - switch( flags & (GEMM_1_T|GEMM_2_T) ) { case 0: d_size = Size( B.cols, a_size.height ); len = B.rows; - CV_Assert( a_size.width == len ); break; case 1: d_size = Size( B.cols, a_size.width ); len = B.rows; - CV_Assert( a_size.height == len ); break; case 2: d_size = Size( B.rows, a_size.height ); len = B.cols; - CV_Assert( a_size.width == len ); break; case 3: d_size = Size( B.rows, a_size.width ); len = B.cols; - CV_Assert( a_size.height == len ); break; } - if( !C.empty() ) - { - CV_Assert( C.type() == type && - (((flags&GEMM_3_T) == 0 && C.rows == d_size.height && C.cols == d_size.width) || - ((flags&GEMM_3_T) != 0 && C.rows == d_size.width && C.cols == d_size.height))); - } - - _matD.create( d_size.height, d_size.width, type ); - Mat D = _matD.getMat(); - if( (flags & GEMM_3_T) != 0 && C.data == D.data ) - { - transpose( C, C ); - flags &= ~GEMM_3_T; - } - if( flags == 0 && 2 <= len && len <= 4 && (len == d_size.width || len == d_size.height) ) { if( type == CV_32F ) @@ -1194,8 +1160,7 @@ void cv::gemm( InputArray matA, InputArray matB, double alpha, GEMMSingleMulFunc singleMulFunc; GEMMBlockMulFunc blockMulFunc; GEMMStoreFunc storeFunc; - Mat *matD = &D, tmat; - size_t tmat_size = 0; + Mat *matD = &D; const uchar* Cdata = C.data; size_t Cstep = C.data ? (size_t)C.step : 0; AutoBuffer buf; @@ -1226,13 +1191,6 @@ void cv::gemm( InputArray matA, InputArray matB, double alpha, storeFunc = (GEMMStoreFunc)GEMMStore_64fc; } - if( D.data == A.data || D.data == B.data ) - { - tmat_size = (size_t)d_size.width*d_size.height*CV_ELEM_SIZE(type); - // Allocate tmat later, once the size of buf is known - matD = &tmat; - } - if( (d_size.width == 1 || len == 1) && !(flags & GEMM_2_T) && B.isContinuous() ) { b_step = d_size.width == 1 ? 0 : CV_ELEM_SIZE(type); @@ -1306,10 +1264,6 @@ void cv::gemm( InputArray matA, InputArray matB, double alpha, (d_size.width <= block_lin_size && d_size.height <= block_lin_size && len <= block_lin_size) ) { - if( tmat_size > 0 ) { - buf.allocate(tmat_size); - tmat = Mat(d_size.height, d_size.width, type, (uchar*)buf ); - } singleMulFunc( A.ptr(), A.step, B.ptr(), b_step, Cdata, Cstep, matD->ptr(), matD->step, a_size, d_size, alpha, beta, flags ); } @@ -1369,14 +1323,12 @@ void cv::gemm( InputArray matA, InputArray matB, double alpha, flags &= ~GEMM_1_T; } - buf.allocate(d_buf_size + b_buf_size + a_buf_size + tmat_size); + buf.allocate(d_buf_size + b_buf_size + a_buf_size); d_buf = (uchar*)buf; b_buf = d_buf + d_buf_size; if( is_a_t ) a_buf = b_buf + b_buf_size; - if( tmat_size > 0 ) - tmat = Mat(d_size.height, d_size.width, type, b_buf + b_buf_size + a_buf_size ); for( i = 0; i < d_size.height; i += di ) { @@ -1455,10 +1407,198 @@ void cv::gemm( InputArray matA, InputArray matB, double alpha, } } } + } +} + +template inline static void +_callInternalGemmImpl(const fptype *src1, size_t src1_step, const fptype *src2, size_t src2_step, fptype alpha, + const fptype *src3, size_t src3_step, fptype beta, fptype *dst, size_t dst_step, int m_a, int n_a, int n_d, int flags, int type) +{ + CV_StaticAssert(GEMM_1_T == CV_HAL_GEMM_1_T, "Incompatible GEMM_1_T flag in HAL"); + CV_StaticAssert(GEMM_2_T == CV_HAL_GEMM_2_T, "Incompatible GEMM_2_T flag in HAL"); + CV_StaticAssert(GEMM_3_T == CV_HAL_GEMM_3_T, "Incompatible GEMM_3_T flag in HAL"); + + int b_m, b_n, c_m, c_n, m_d; + + if(flags & GEMM_2_T) + { + b_m = n_d; + if(flags & GEMM_1_T ) + { + b_n = m_a; + m_d = n_a; + } + else + { + b_n = n_a; + m_d = m_a; + } + } + else + { + b_n = n_d; + if(flags & GEMM_1_T ) + { + b_m = m_a; + m_d = n_a; + } + else + { + m_d = m_a; + b_m = n_a; + } + } - if( matD != &D ) - matD->copyTo(D); + if(flags & GEMM_3_T) + { + c_m = n_d; + c_n = m_d; + } + else + { + c_m = m_d; + c_n = n_d; } + + Mat A, B, C; + if(src1 != NULL) + A = Mat(m_a, n_a, type, (void*)src1, src1_step); + if(src2 != NULL) + B = Mat(b_m, b_n, type, (void*)src2, src2_step); + if(src3 != NULL && beta != 0.0) + C = Mat(c_m, c_n, type, (void*)src3, src3_step); + Mat D(m_d, n_d, type, (void*)dst, dst_step); + + _gemmImplInternal(A, B, alpha, C, beta, D, flags); +} + +} + +void cv::hal::gemm32f(const float* src1, size_t src1_step, const float* src2, size_t src2_step, + float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step, + int m_a, int n_a, int n_d, int flags) +{ + + CALL_HAL(gemm32f, cv_hal_gemm32f, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags) + _callInternalGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_32F); +} + +void cv::hal::gemm64f(const double* src1, size_t src1_step, const double* src2, size_t src2_step, + double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step, + int m_a, int n_a, int n_d, int flags) +{ + CALL_HAL(gemm64f, cv_hal_gemm64f, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags) + _callInternalGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_64F); +} + +CV_EXPORTS void cv::hal::gemm32fc(const float* src1, size_t src1_step, const float* src2, size_t src2_step, + float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step, + int m_a, int n_a, int n_d, int flags) +{ + CALL_HAL(gemm32fc, cv_hal_gemm32fc, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags) + _callInternalGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_32FC2); +} + +CV_EXPORTS void cv::hal::gemm64fc(const double* src1, size_t src1_step, const double* src2, size_t src2_step, + double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step, + int m_a, int n_a, int n_d, int flags) +{ + CALL_HAL(gemm64fc, cv_hal_gemm64fc, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags) + _callInternalGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_64FC2); +} + +void cv::gemm( InputArray matA, InputArray matB, double alpha, + InputArray matC, double beta, OutputArray _matD, int flags ) +{ +#ifdef HAVE_CLAMDBLAS + CV_OCL_RUN(ocl::haveAmdBlas() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2 && _matD.isUMat() && + matA.cols() > 20 && matA.rows() > 20 && matB.cols() > 20, // since it works incorrect for small sizes + ocl_gemm_amdblas(matA, matB, alpha, matC, beta, _matD, flags)) +#endif + +#ifdef HAVE_OPENCL + CV_OCL_RUN(_matD.isUMat() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2, + ocl_gemm(matA, matB, alpha, matC, beta, _matD, flags)) +#endif + + Mat A = matA.getMat(), B = matB.getMat(), C = beta != 0.0 ? matC.getMat() : Mat(); + Size a_size = A.size(), d_size; + int len = 0, type = A.type(); + + CV_Assert( type == B.type() && (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) ); + + switch( flags & (GEMM_1_T|GEMM_2_T) ) + { + case 0: + d_size = Size( B.cols, a_size.height ); + len = B.rows; + CV_Assert( a_size.width == len ); + break; + case 1: + d_size = Size( B.cols, a_size.width ); + len = B.rows; + CV_Assert( a_size.height == len ); + break; + case 2: + d_size = Size( B.rows, a_size.height ); + len = B.cols; + CV_Assert( a_size.width == len ); + break; + case 3: + d_size = Size( B.rows, a_size.width ); + len = B.cols; + CV_Assert( a_size.height == len ); + break; + } + + if( !C.empty() ) + { + CV_Assert( C.type() == type && + (((flags&GEMM_3_T) == 0 && C.rows == d_size.height && C.cols == d_size.width) || + ((flags&GEMM_3_T) != 0 && C.rows == d_size.width && C.cols == d_size.height))); + } + + _matD.create( d_size.height, d_size.width, type ); + Mat D = _matD.getMat(); + if( (flags & GEMM_3_T) != 0 && C.data == D.data ) + { + transpose( C, C ); + flags &= ~GEMM_3_T; + } + + Mat *DProxyPtr = &D, DProxy; + if( D.data == A.data || D.data == B.data ) + { + DProxy = Mat(d_size.height, d_size.width, D.type()); + DProxyPtr = &DProxy; + } + + if( type == CV_32FC1 ) + hal::gemm32f(A.ptr(), A.step, B.ptr(), B.step, static_cast(alpha), + C.ptr(), C.step, static_cast(beta), + DProxyPtr->ptr(), DProxyPtr->step, + a_size.height, a_size.width, DProxyPtr->cols, flags); + else if( type == CV_64FC1 ) + hal::gemm64f(A.ptr(), A.step, B.ptr(), B.step, alpha, + C.ptr(), C.step, beta, + DProxyPtr->ptr(), DProxyPtr->step, + a_size.height, a_size.width, DProxyPtr->cols, flags); + else if( type == CV_32FC2 ) + hal::gemm32fc(A.ptr(), A.step, B.ptr(), B.step, static_cast(alpha), + C.ptr(), C.step, static_cast(beta), + DProxyPtr->ptr(), DProxyPtr->step, + a_size.height, a_size.width, DProxyPtr->cols, flags); + else + { + CV_Assert( type == CV_64FC2 ); + hal::gemm64fc(A.ptr(), A.step, B.ptr(), B.step, alpha, + C.ptr(), C.step, beta, + D.ptr(), D.step, + a_size.height, a_size.width, DProxyPtr->cols, flags); + } + + if(DProxyPtr != &D) + DProxyPtr->copyTo(D); } /****************************************************************************************\ diff --git a/modules/core/src/matrix_decomp.cpp b/modules/core/src/matrix_decomp.cpp index 3fe9eca..dc9c014 100644 --- a/modules/core/src/matrix_decomp.cpp +++ b/modules/core/src/matrix_decomp.cpp @@ -89,8 +89,6 @@ LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n, _Tp eps) for( k = 0; k < n; k++ ) b[j*bstep + k] += alpha*b[i*bstep + k]; } - - A[i*astep + i] = -d; } if( b ) @@ -101,7 +99,7 @@ LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n, _Tp eps) _Tp s = b[i*bstep + j]; for( k = i+1; k < m; k++ ) s -= A[i*astep + k]*b[k*bstep + j]; - b[i*bstep + j] = s*A[i*astep + i]; + b[i*bstep + j] = s/A[i*astep + i]; } } @@ -111,13 +109,19 @@ LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n, _Tp eps) int LU32f(float* A, size_t astep, int m, float* b, size_t bstep, int n) { - return LUImpl(A, astep, m, b, bstep, n, FLT_EPSILON*10); + int output; + CALL_HAL_RET(LU32f, cv_hal_LU32f, output, A, astep, m, b, bstep, n) + output = LUImpl(A, astep, m, b, bstep, n, FLT_EPSILON*10); + return output; } int LU64f(double* A, size_t astep, int m, double* b, size_t bstep, int n) { - return LUImpl(A, astep, m, b, bstep, n, DBL_EPSILON*100); + int output; + CALL_HAL_RET(LU64f, cv_hal_LU64f, output, A, astep, m, b, bstep, n) + output = LUImpl(A, astep, m, b, bstep, n, DBL_EPSILON*100); + return output; } template static inline bool @@ -193,14 +197,17 @@ CholImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n) return true; } - bool Cholesky32f(float* A, size_t astep, int m, float* b, size_t bstep, int n) { + bool output; + CALL_HAL_RET(Cholesky32f, cv_hal_Cholesky32f, output, A, astep, m, b, bstep, n) return CholImpl(A, astep, m, b, bstep, n); } bool Cholesky64f(double* A, size_t astep, int m, double* b, size_t bstep, int n) { + bool output; + CALL_HAL_RET(Cholesky64f, cv_hal_Cholesky64f, output, A, astep, m, b, bstep, n) return CholImpl(A, astep, m, b, bstep, n); } diff --git a/modules/imgproc/src/hal_replacement.hpp b/modules/imgproc/src/hal_replacement.hpp index 2b681f6..e42cc44 100644 --- a/modules/imgproc/src/hal_replacement.hpp +++ b/modules/imgproc/src/hal_replacement.hpp @@ -309,4 +309,23 @@ inline int hal_ni_warpPerspectve(int src_type, const uchar *src_data, size_t src #include "custom_hal.hpp" +//! @cond IGNORED +#define CALL_HAL_RET(name, fun, retval, ...) \ + int res = fun(__VA_ARGS__, &retval); \ + if (res == CV_HAL_ERROR_OK) \ + return retval; \ + else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED) \ + CV_Error_(cv::Error::StsInternal, \ + ("HAL implementation " CVAUX_STR(name) " ==> " CVAUX_STR(fun) " returned %d (0x%08x)", res, res)); + + +#define CALL_HAL(name, fun, ...) \ + int res = fun(__VA_ARGS__); \ + if (res == CV_HAL_ERROR_OK) \ + return; \ + else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED) \ + CV_Error_(cv::Error::StsInternal, \ + ("HAL implementation " CVAUX_STR(name) " ==> " CVAUX_STR(fun) " returned %d (0x%08x)", res, res)); +//! @endcond + #endif -- 2.7.4