From: Roman Donchenko Date: Tue, 5 Nov 2013 12:38:23 +0000 (+0400) Subject: Merge remote-tracking branch 'origin/2.4' into merge-2.4 X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~3703^2~2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=9c83f6c4fbc4eeeaeb498425a2b28a5df0ca0d97;p=platform%2Fupstream%2Fopencv.git Merge remote-tracking branch 'origin/2.4' into merge-2.4 Conflicts: cmake/OpenCVDetectCUDA.cmake modules/core/include/opencv2/core/version.hpp modules/cudacodec/src/ffmpeg_video_source.cpp modules/gpu/src/video_writer.cpp modules/highgui/test/test_ffmpeg.cpp modules/highgui/test/test_video_io.cpp modules/highgui/test/test_video_pos.cpp modules/ocl/include/opencv2/ocl/ocl.hpp modules/ocl/include/opencv2/ocl/private/util.hpp modules/ocl/src/arithm.cpp modules/ocl/src/blend.cpp modules/ocl/src/canny.cpp modules/ocl/src/cl_operations.cpp modules/ocl/src/filtering.cpp modules/ocl/src/haar.cpp modules/ocl/src/imgproc.cpp modules/ocl/src/kmeans.cpp modules/ocl/src/moments.cpp modules/ocl/src/safe_call.hpp modules/ocl/src/split_merge.cpp modules/ocl/test/test_moments.cpp samples/ocl/squares.cpp --- 9c83f6c4fbc4eeeaeb498425a2b28a5df0ca0d97 diff --cc CMakeLists.txt index 73def95,3978aad..324d069 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@@ -31,7 -28,21 +31,11 @@@ else(NOT CMAKE_TOOLCHAIN_FILE set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE PATH "Installation Directory") endif(NOT CMAKE_TOOLCHAIN_FILE) -# -------------------------------------------------------------- -# Top level OpenCV project -# -------------------------------------------------------------- -if(CMAKE_GENERATOR MATCHES Xcode AND XCODE_VERSION VERSION_GREATER 4.3) - cmake_minimum_required(VERSION 2.8.8) -elseif(IOS) - cmake_minimum_required(VERSION 2.8.0) -else() - cmake_minimum_required(VERSION 2.6.3) -endif() + if(POLICY CMP0017) + cmake_policy(SET CMP0017 NEW) + endif() + if(POLICY CMP0022) cmake_policy(SET CMP0022 OLD) endif() diff --cc cmake/OpenCVDetectCUDA.cmake 
index 173bee3,156d90e..87dc4d1 --- a/cmake/OpenCVDetectCUDA.cmake +++ b/cmake/OpenCVDetectCUDA.cmake @@@ -8,8 -13,24 +8,24 @@@ if(CMAKE_COMPILER_IS_GNUCXX AND NOT APP return() endif() + set(CMAKE_MODULE_PATH "${OpenCV_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH}) + + foreach(var INCLUDE LIBRARY PROGRAM) + set(__old_frpm_${var} "${CMAKE_FIND_ROOT_PATH_MODE_${var}}") + endforeach() + + set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) + set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) + set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER) + -find_package(CUDA 4.2 QUIET) +find_package(CUDA "${MIN_VER_CUDA}" QUIET) + foreach(var INCLUDE LIBRARY PROGRAM) + set(CMAKE_FIND_ROOT_PATH_MODE_${var} "${__old_frpm_${var}}") + endforeach() + + list(REMOVE_AT CMAKE_MODULE_PATH 0) + if(CUDA_FOUND) set(HAVE_CUDA 1) @@@ -21,52 -42,8 +37,11 @@@ set(HAVE_CUBLAS 1) endif() - if(${CUDA_VERSION} VERSION_LESS "5.5") - find_cuda_helper_libs(npp) - else() - # hack for CUDA 5.5 - if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "arm") - unset(CUDA_TOOLKIT_INCLUDE CACHE) - unset(CUDA_CUDART_LIBRARY CACHE) - unset(CUDA_cublas_LIBRARY CACHE) - unset(CUDA_cufft_LIBRARY CACHE) - unset(CUDA_npp_LIBRARY CACHE) - - if(SOFTFP) - set(cuda_arm_path "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabi") - else() - set(cuda_arm_path "${CUDA_TOOLKIT_ROOT_DIR}/targets/armv7-linux-gnueabihf") - endif() - - set(CUDA_TOOLKIT_INCLUDE "${cuda_arm_path}/include" CACHE PATH "include path") - set(CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) - - set(cuda_arm_library_path "${cuda_arm_path}/lib") - - set(CUDA_CUDART_LIBRARY "${cuda_arm_library_path}/libcudart.so" CACHE FILEPATH "cudart library") - set(CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY}) - set(CUDA_cublas_LIBRARY "${cuda_arm_library_path}/libcublas.so" CACHE FILEPATH "cublas library") - set(CUDA_cufft_LIBRARY "${cuda_arm_library_path}/libcufft.so" CACHE FILEPATH "cufft library") - set(CUDA_nppc_LIBRARY "${cuda_arm_library_path}/libnppc.so" CACHE FILEPATH "nppc library") - set(CUDA_nppi_LIBRARY 
"${cuda_arm_library_path}/libnppi.so" CACHE FILEPATH "nppi library") - set(CUDA_npps_LIBRARY "${cuda_arm_library_path}/libnpps.so" CACHE FILEPATH "npps library") - set(CUDA_npp_LIBRARY "${CUDA_nppc_LIBRARY};${CUDA_nppi_LIBRARY};${CUDA_npps_LIBRARY}" CACHE STRING "npp library") - else() - unset(CUDA_npp_LIBRARY CACHE) - - find_cuda_helper_libs(nppc) - find_cuda_helper_libs(nppi) - find_cuda_helper_libs(npps) - - set(CUDA_npp_LIBRARY "${CUDA_nppc_LIBRARY};${CUDA_nppi_LIBRARY};${CUDA_npps_LIBRARY}" CACHE STRING "npp library") - endif() - endif() - if(WITH_NVCUVID) find_cuda_helper_libs(nvcuvid) + if(WIN32) + find_cuda_helper_libs(nvcuvenc) + endif() set(HAVE_NVCUVID 1) endif() diff --cc modules/highgui/test/test_ffmpeg.cpp index 01afa83,468fe77..85ee0be --- a/modules/highgui/test/test_ffmpeg.cpp +++ b/modules/highgui/test/test_ffmpeg.cpp @@@ -84,64 -84,63 +84,63 @@@ public for (size_t j = 0; j < n; ++j) { - int tag = tags[j]; - stringstream s; - s << tag; + int tag = tags[j]; + stringstream s; + s << tag; - const string filename = "output_"+s.str()+".avi"; + const string filename = "output_"+s.str()+".avi"; - try - { - double fps = fps0; - Size frame_s = Size(img_c, img_r); - - if( tag == VideoWriter::fourcc('H', '2', '6', '1') ) - frame_s = Size(352, 288); - else if( tag == VideoWriter::fourcc('H', '2', '6', '3') ) - frame_s = Size(704, 576); - /*else if( tag == CV_FOURCC('M', 'J', 'P', 'G') || - tag == CV_FOURCC('j', 'p', 'e', 'g') ) - frame_s = Size(1920, 1080);*/ - - if( tag == VideoWriter::fourcc('M', 'P', 'E', 'G') ) + try { - frame_s = Size(720, 576); - fps = 25; - } - - VideoWriter writer(filename, tag, fps, frame_s); + double fps = fps0; + Size frame_s = Size(img_c, img_r); + - if( tag == CV_FOURCC('H', '2', '6', '1') ) ++ if( tag == VideoWriter::fourcc('H', '2', '6', '1') ) + frame_s = Size(352, 288); - else if( tag == CV_FOURCC('H', '2', '6', '3') ) ++ else if( tag == VideoWriter::fourcc('H', '2', '6', '3') ) + frame_s = Size(704, 576); + /*else if( tag == 
CV_FOURCC('M', 'J', 'P', 'G') || + tag == CV_FOURCC('j', 'p', 'e', 'g') ) + frame_s = Size(1920, 1080);*/ + - if( tag == CV_FOURCC('M', 'P', 'E', 'G') ) ++ if( tag == VideoWriter::fourcc('M', 'P', 'E', 'G') ) + { + frame_s = Size(720, 576); + fps = 25; + } - if (writer.isOpened() == false) - { - ts->printf(ts->LOG, "\n\nFile name: %s\n", filename.c_str()); - ts->printf(ts->LOG, "Codec id: %d Codec tag: %c%c%c%c\n", j, - tag & 255, (tag >> 8) & 255, (tag >> 16) & 255, (tag >> 24) & 255); - ts->printf(ts->LOG, "Error: cannot create video file."); - ts->set_failed_test_info(ts->FAIL_INVALID_OUTPUT); - } - else - { - Mat img(frame_s, CV_8UC3, Scalar::all(0)); - const int coeff = cvRound(min(frame_s.width, frame_s.height)/(fps0 * time_sec)); + VideoWriter writer(filename, tag, fps, frame_s); - for (int i = 0 ; i < static_cast(fps * time_sec); i++ ) + if (writer.isOpened() == false) { - //circle(img, Point2i(img_c / 2, img_r / 2), min(img_r, img_c) / 2 * (i + 1), Scalar(255, 0, 0, 0), 2); - rectangle(img, Point2i(coeff * i, coeff * i), Point2i(coeff * (i + 1), coeff * (i + 1)), - Scalar::all(255 * (1.0 - static_cast(i) / (fps * time_sec * 2) )), -1); - writer << img; + ts->printf(ts->LOG, "\n\nFile name: %s\n", filename.c_str()); + ts->printf(ts->LOG, "Codec id: %d Codec tag: %c%c%c%c\n", j, + tag & 255, (tag >> 8) & 255, (tag >> 16) & 255, (tag >> 24) & 255); + ts->printf(ts->LOG, "Error: cannot create video file."); + ts->set_failed_test_info(ts->FAIL_INVALID_OUTPUT); + } + else + { + Mat img(frame_s, CV_8UC3, Scalar::all(0)); + const int coeff = cvRound(min(frame_s.width, frame_s.height)/(fps0 * time_sec)); + + for (int i = 0 ; i < static_cast(fps * time_sec); i++ ) + { + //circle(img, Point2i(img_c / 2, img_r / 2), min(img_r, img_c) / 2 * (i + 1), Scalar(255, 0, 0, 0), 2); + rectangle(img, Point2i(coeff * i, coeff * i), Point2i(coeff * (i + 1), coeff * (i + 1)), + Scalar::all(255 * (1.0 - static_cast(i) / (fps * time_sec * 2) )), -1); + writer << img; + } + + if 
(!created) created = true; + else remove(filename.c_str()); } - - if (!created) created = true; - else remove(filename.c_str()); } - } - catch(...) - { - ts->set_failed_test_info(ts->FAIL_INVALID_OUTPUT); - } - ts->set_failed_test_info(cvtest::TS::OK); - + catch(...) + { + ts->set_failed_test_info(ts->FAIL_INVALID_OUTPUT); + } + ts->set_failed_test_info(cvtest::TS::OK); } } }; diff --cc modules/ocl/doc/image_filtering.rst index bf46802,cbec29b..e020dc7 --- a/modules/ocl/doc/image_filtering.rst +++ b/modules/ocl/doc/image_filtering.rst @@@ -459,37 -453,12 +453,41 @@@ Returns voi :param scale: The optional scale factor for the computed Laplacian values (by default, no scaling is applied + :param delta: Optional delta value that is added to the results prior to storing them in ``dst`` . Supported value is 0 only. + + :param bordertype: Pixel extrapolation method. + The function calculates the Laplacian of the source image by adding up the second x and y derivatives calculated using the Sobel operator. +ocl::ConvolveBuf +---------------- +.. ocv:struct:: ocl::ConvolveBuf + +Class providing a memory buffer for :ocv:func:`ocl::convolve` function, plus it allows to adjust some specific parameters. :: + + struct CV_EXPORTS ConvolveBuf + { + Size result_size; + Size block_size; + Size user_block_size; + Size dft_size; + int spect_len; + + oclMat image_spect, templ_spect, result_spect; + oclMat image_block, templ_block, result_data; + + void create(Size image_size, Size templ_size); + static Size estimateBlockSize(Size result_size, Size templ_size); + }; + +You can use field `user_block_size` to set specific block size for :ocv:func:`ocl::convolve` function. If you leave its default value `Size(0,0)` then automatic estimation of block size will be used (which is optimized for speed). By varying `user_block_size` you can reduce memory requirements at the cost of speed. + +ocl::ConvolveBuf::create +------------------------ +.. 
ocv:function:: ocl::ConvolveBuf::create(Size image_size, Size templ_size) + +Constructs a buffer for :ocv:func:`ocl::convolve` function with respective arguments. + ocl::convolve ------------------ Returns void diff --cc modules/ocl/include/opencv2/ocl.hpp index 3f0fb29,0000000..b8c26b2 mode 100644,000000..100644 --- a/modules/ocl/include/opencv2/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl.hpp @@@ -1,2070 -1,0 +1,2076 @@@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation - // and/or other oclMaterials provided with the distribution. ++// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. 
+// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_OCL_HPP__ +#define __OPENCV_OCL_HPP__ + +#include +#include + +#include "opencv2/core.hpp" +#include "opencv2/imgproc.hpp" +#include "opencv2/objdetect.hpp" +#include "opencv2/ml.hpp" + +namespace cv +{ + namespace ocl + { + enum DeviceType + { + CVCL_DEVICE_TYPE_DEFAULT = (1 << 0), + CVCL_DEVICE_TYPE_CPU = (1 << 1), + CVCL_DEVICE_TYPE_GPU = (1 << 2), + CVCL_DEVICE_TYPE_ACCELERATOR = (1 << 3), + //CVCL_DEVICE_TYPE_CUSTOM = (1 << 4) + CVCL_DEVICE_TYPE_ALL = 0xFFFFFFFF + }; + + enum DevMemRW + { + DEVICE_MEM_R_W = 0, + DEVICE_MEM_R_ONLY, + DEVICE_MEM_W_ONLY + }; + + enum DevMemType + { + DEVICE_MEM_DEFAULT = 0, + DEVICE_MEM_AHP, //alloc host pointer + DEVICE_MEM_UHP, //use host pointer + DEVICE_MEM_CHP, //copy host pointer + DEVICE_MEM_PM //persistent memory + }; + + // these classes contain OpenCL runtime information + + struct PlatformInfo; + + struct DeviceInfo + { + public: + int _id; // reserved, don't use it + + DeviceType deviceType; + std::string deviceProfile; + std::string deviceVersion; + std::string deviceName; + std::string deviceVendor; + int deviceVendorId; + std::string deviceDriverVersion; + std::string deviceExtensions; + + size_t 
maxWorkGroupSize; + std::vector maxWorkItemSizes; + int maxComputeUnits; + size_t localMemorySize; + size_t maxMemAllocSize; + + int deviceVersionMajor; + int deviceVersionMinor; + + bool haveDoubleSupport; + bool isUnifiedMemory; // 1 means integrated GPU, otherwise this value is 0 ++ bool isIntelDevice; + + std::string compilationExtraOptions; + + const PlatformInfo* platform; + + DeviceInfo(); + }; + + struct PlatformInfo + { + int _id; // reserved, don't use it + + std::string platformProfile; + std::string platformVersion; + std::string platformName; + std::string platformVendor; + std::string platformExtensons; + + int platformVersionMajor; + int platformVersionMinor; + + std::vector devices; + + PlatformInfo(); + }; + + //////////////////////////////// Initialization & Info //////////////////////// + typedef std::vector PlatformsInfo; + + CV_EXPORTS int getOpenCLPlatforms(PlatformsInfo& platforms); + + typedef std::vector DevicesInfo; + + CV_EXPORTS int getOpenCLDevices(DevicesInfo& devices, int deviceType = CVCL_DEVICE_TYPE_GPU, + const PlatformInfo* platform = NULL); + + // set device you want to use + CV_EXPORTS void setDevice(const DeviceInfo* info); + + enum FEATURE_TYPE + { + FEATURE_CL_DOUBLE = 1, + FEATURE_CL_UNIFIED_MEM, - FEATURE_CL_VER_1_2 ++ FEATURE_CL_VER_1_2, ++ FEATURE_CL_INTEL_DEVICE + }; + + // Represents OpenCL context, interface + class CV_EXPORTS Context + { + protected: + Context() { } + ~Context() { } + public: + static Context *getContext(); + + bool supportsFeature(FEATURE_TYPE featureType) const; + const DeviceInfo& getDeviceInfo() const; + + const void* getOpenCLContextPtr() const; + const void* getOpenCLCommandQueuePtr() const; + const void* getOpenCLDeviceIDPtr() const; + }; + + inline const void *getClContextPtr() + { + return Context::getContext()->getOpenCLContextPtr(); + } + + inline const void *getClCommandQueuePtr() + { + return Context::getContext()->getOpenCLCommandQueuePtr(); + } + + CV_EXPORTS bool 
supportsFeature(FEATURE_TYPE featureType); + + CV_EXPORTS void finish(); + + enum BINARY_CACHE_MODE + { + CACHE_NONE = 0, // do not cache OpenCL binary + CACHE_DEBUG = 0x1 << 0, // cache OpenCL binary when built in debug mode + CACHE_RELEASE = 0x1 << 1, // default behavior, only cache when built in release mode + CACHE_ALL = CACHE_DEBUG | CACHE_RELEASE, // cache opencl binary + }; + //! Enable or disable OpenCL program binary caching onto local disk + // After a program (*.cl files in opencl/ folder) is built at runtime, we allow the + // compiled OpenCL program to be cached to the path automatically as "path/*.clb" + // binary file, which will be reused when the OpenCV executable is started again. + // + // This feature is enabled by default. + CV_EXPORTS void setBinaryDiskCache(int mode = CACHE_RELEASE, cv::String path = "./"); + + //! set where binary cache to be saved to + CV_EXPORTS void setBinaryPath(const char *path); + + struct ProgramSource + { + const char* name; + const char* programStr; + const char* programHash; + + // Cache in memory by name (should be unique). Caching on disk disabled. + inline ProgramSource(const char* _name, const char* _programStr) + : name(_name), programStr(_programStr), programHash(NULL) + { + } + + // Cache in memory by name (should be unique). Caching on disk uses programHash mark. + inline ProgramSource(const char* _name, const char* _programStr, const char* _programHash) + : name(_name), programStr(_programStr), programHash(_programHash) + { + } + }; + + //! Calls OpenCL kernel. Pass globalThreads = NULL, and cleanUp = true, to finally clean-up without executing. + //! 
Deprecated, will be replaced + CV_EXPORTS void openCLExecuteKernelInterop(Context *clCxt, + const cv::ocl::ProgramSource& source, String kernelName, + size_t globalThreads[3], size_t localThreads[3], + std::vector< std::pair > &args, + int channels, int depth, const char *build_options); + + class CV_EXPORTS oclMatExpr; + //////////////////////////////// oclMat //////////////////////////////// + class CV_EXPORTS oclMat + { + public: + //! default constructor + oclMat(); + //! constructs oclMatrix of the specified size and type (_type is CV_8UC1, CV_64FC3, CV_32SC(12) etc.) + oclMat(int rows, int cols, int type); + oclMat(Size size, int type); + //! constucts oclMatrix and fills it with the specified value _s. + oclMat(int rows, int cols, int type, const Scalar &s); + oclMat(Size size, int type, const Scalar &s); + //! copy constructor + oclMat(const oclMat &m); + + //! constructor for oclMatrix headers pointing to user-allocated data + oclMat(int rows, int cols, int type, void *data, size_t step = Mat::AUTO_STEP); + oclMat(Size size, int type, void *data, size_t step = Mat::AUTO_STEP); + + //! creates a matrix header for a part of the bigger matrix + oclMat(const oclMat &m, const Range &rowRange, const Range &colRange); + oclMat(const oclMat &m, const Rect &roi); + + //! builds oclMat from Mat. Perfom blocking upload to device. + explicit oclMat (const Mat &m); + + //! destructor - calls release() + ~oclMat(); + + //! assignment operators + oclMat &operator = (const oclMat &m); + //! assignment operator. Perfom blocking upload to device. + oclMat &operator = (const Mat &m); + oclMat &operator = (const oclMatExpr& expr); + + //! pefroms blocking upload data to oclMat. + void upload(const cv::Mat &m); + + + //! downloads data from device to host memory. Blocking calls. + operator Mat() const; + void download(cv::Mat &m) const; + + //! convert to _InputArray + operator _InputArray(); + + //! convert to _OutputArray + operator _OutputArray(); + + //! 
returns a new oclMatrix header for the specified row + oclMat row(int y) const; + //! returns a new oclMatrix header for the specified column + oclMat col(int x) const; + //! ... for the specified row span + oclMat rowRange(int startrow, int endrow) const; + oclMat rowRange(const Range &r) const; + //! ... for the specified column span + oclMat colRange(int startcol, int endcol) const; + oclMat colRange(const Range &r) const; + + //! returns deep copy of the oclMatrix, i.e. the data is copied + oclMat clone() const; + + //! copies those oclMatrix elements to "m" that are marked with non-zero mask elements. + // It calls m.create(this->size(), this->type()). + // It supports any data type + void copyTo( oclMat &m, const oclMat &mask = oclMat()) const; + + //! converts oclMatrix to another datatype with optional scalng. See cvConvertScale. + //It supports 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4 + void convertTo( oclMat &m, int rtype, double alpha = 1, double beta = 0 ) const; + + void assignTo( oclMat &m, int type = -1 ) const; + + //! sets every oclMatrix element to s + //It supports 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4 + oclMat& operator = (const Scalar &s); + //! sets some of the oclMatrix elements to s, according to the mask + //It supports 8UC1 8UC4 32SC1 32SC4 32FC1 32FC4 + oclMat& setTo(const Scalar &s, const oclMat &mask = oclMat()); + //! creates alternative oclMatrix header for the same data, with different + // number of channels and/or different number of rows. see cvReshape. + oclMat reshape(int cn, int rows = 0) const; + + //! allocates new oclMatrix data unless the oclMatrix already has specified size and type. + // previous data is unreferenced if needed. + void create(int rows, int cols, int type); + void create(Size size, int type); + + //! allocates new oclMatrix with specified device memory type. 
+ void createEx(int rows, int cols, int type, + DevMemRW rw_type, DevMemType mem_type); + void createEx(Size size, int type, DevMemRW rw_type, + DevMemType mem_type); + + //! decreases reference counter; + // deallocate the data when reference counter reaches 0. + void release(); + + //! swaps with other smart pointer + void swap(oclMat &mat); + + //! locates oclMatrix header within a parent oclMatrix. See below + void locateROI( Size &wholeSize, Point &ofs ) const; + //! moves/resizes the current oclMatrix ROI inside the parent oclMatrix. + oclMat& adjustROI( int dtop, int dbottom, int dleft, int dright ); + //! extracts a rectangular sub-oclMatrix + // (this is a generalized form of row, rowRange etc.) + oclMat operator()( Range rowRange, Range colRange ) const; + oclMat operator()( const Rect &roi ) const; + + oclMat& operator+=( const oclMat& m ); + oclMat& operator-=( const oclMat& m ); + oclMat& operator*=( const oclMat& m ); + oclMat& operator/=( const oclMat& m ); + + //! returns true if the oclMatrix data is continuous + // (i.e. when there are no gaps between successive rows). + // similar to CV_IS_oclMat_CONT(cvoclMat->type) + bool isContinuous() const; + //! returns element size in bytes, + // similar to CV_ELEM_SIZE(cvMat->type) + size_t elemSize() const; + //! returns the size of element channel in bytes. + size_t elemSize1() const; + //! returns element type, similar to CV_MAT_TYPE(cvMat->type) + int type() const; + //! returns element type, i.e. 8UC3 returns 8UC4 because in ocl + //! 3 channels element actually use 4 channel space + int ocltype() const; + //! returns element type, similar to CV_MAT_DEPTH(cvMat->type) + int depth() const; + //! returns element type, similar to CV_MAT_CN(cvMat->type) + int channels() const; + //! returns element type, return 4 for 3 channels element, + //!becuase 3 channels element actually use 4 channel space + int oclchannels() const; + //! returns step/elemSize1() + size_t step1() const; + //! 
returns oclMatrix size: + // width == number of columns, height == number of rows + Size size() const; + //! returns true if oclMatrix data is NULL + bool empty() const; + + //! returns pointer to y-th row + uchar* ptr(int y = 0); + const uchar *ptr(int y = 0) const; + + //! template version of the above method + template _Tp *ptr(int y = 0); + template const _Tp *ptr(int y = 0) const; + + //! matrix transposition + oclMat t() const; + + /*! includes several bit-fields: + - the magic signature + - continuity flag + - depth + - number of channels + */ + int flags; + //! the number of rows and columns + int rows, cols; + //! a distance between successive rows in bytes; includes the gap if any + size_t step; + //! pointer to the data(OCL memory object) + uchar *data; + + //! pointer to the reference counter; + // when oclMatrix points to user-allocated data, the pointer is NULL + int *refcount; + + //! helper fields used in locateROI and adjustROI + //datastart and dataend are not used in current version + uchar *datastart; + uchar *dataend; + + //! OpenCL context associated with the oclMat object. + Context *clCxt; // TODO clCtx + //add offset for handle ROI, calculated in byte + int offset; + //add wholerows and wholecols for the whole matrix, datastart and dataend are no longer used + int wholerows; + int wholecols; + }; + + // convert InputArray/OutputArray to oclMat references + CV_EXPORTS oclMat& getOclMatRef(InputArray src); + CV_EXPORTS oclMat& getOclMatRef(OutputArray src); + + ///////////////////// mat split and merge ///////////////////////////////// + //! Compose a multi-channel array from several single-channel arrays + // Support all types + CV_EXPORTS void merge(const oclMat *src, size_t n, oclMat &dst); + CV_EXPORTS void merge(const std::vector &src, oclMat &dst); + + //! 
Divides multi-channel array into several single-channel arrays + // Support all types + CV_EXPORTS void split(const oclMat &src, oclMat *dst); + CV_EXPORTS void split(const oclMat &src, std::vector &dst); + + ////////////////////////////// Arithmetics /////////////////////////////////// + + //! adds one matrix to another with scale (dst = src1 * alpha + src2 * beta + gama) + // supports all data types + CV_EXPORTS void addWeighted(const oclMat &src1, double alpha, const oclMat &src2, double beta, double gama, oclMat &dst); + + //! adds one matrix to another (dst = src1 + src2) + // supports all data types + CV_EXPORTS void add(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat()); + //! adds scalar to a matrix (dst = src1 + s) + // supports all data types + CV_EXPORTS void add(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat()); + + //! subtracts one matrix from another (dst = src1 - src2) + // supports all data types + CV_EXPORTS void subtract(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat()); + //! subtracts scalar from a matrix (dst = src1 - s) + // supports all data types + CV_EXPORTS void subtract(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat()); + + //! computes element-wise product of the two arrays (dst = src1 * scale * src2) + // supports all data types + CV_EXPORTS void multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, double scale = 1); + //! multiplies matrix to a number (dst = scalar * src) + // supports all data types + CV_EXPORTS void multiply(double scalar, const oclMat &src, oclMat &dst); + + //! computes element-wise quotient of the two arrays (dst = src1 * scale / src2) + // supports all data types + CV_EXPORTS void divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scale = 1); + //! 
computes element-wise quotient of the two arrays (dst = scale / src) + // supports all data types + CV_EXPORTS void divide(double scale, const oclMat &src1, oclMat &dst); + + //! computes element-wise minimum of the two arrays (dst = min(src1, src2)) + // supports all data types + CV_EXPORTS void min(const oclMat &src1, const oclMat &src2, oclMat &dst); + + //! computes element-wise maximum of the two arrays (dst = max(src1, src2)) + // supports all data types + CV_EXPORTS void max(const oclMat &src1, const oclMat &src2, oclMat &dst); + + //! compares elements of two arrays (dst = src1 src2) + // supports all data types + CV_EXPORTS void compare(const oclMat &src1, const oclMat &src2, oclMat &dst, int cmpop); + + //! transposes the matrix + // supports all data types + CV_EXPORTS void transpose(const oclMat &src, oclMat &dst); + + //! computes element-wise absolute values of an array (dst = abs(src)) + // supports all data types + CV_EXPORTS void abs(const oclMat &src, oclMat &dst); + + //! computes element-wise absolute difference of two arrays (dst = abs(src1 - src2)) + // supports all data types + CV_EXPORTS void absdiff(const oclMat &src1, const oclMat &src2, oclMat &dst); + //! computes element-wise absolute difference of array and scalar (dst = abs(src1 - s)) + // supports all data types + CV_EXPORTS void absdiff(const oclMat &src1, const Scalar &s, oclMat &dst); + + //! computes mean value and standard deviation of all or selected array elements + // supports all data types + CV_EXPORTS void meanStdDev(const oclMat &mtx, Scalar &mean, Scalar &stddev); + + //! computes norm of array + // supports NORM_INF, NORM_L1, NORM_L2 + // supports all data types + CV_EXPORTS double norm(const oclMat &src1, int normType = NORM_L2); + + //! computes norm of the difference between two arrays + // supports NORM_INF, NORM_L1, NORM_L2 + // supports all data types + CV_EXPORTS double norm(const oclMat &src1, const oclMat &src2, int normType = NORM_L2); + + //! 
reverses the order of the rows, columns or both in a matrix + // supports all types + CV_EXPORTS void flip(const oclMat &src, oclMat &dst, int flipCode); + + //! computes sum of array elements + // support all types + CV_EXPORTS Scalar sum(const oclMat &m); + CV_EXPORTS Scalar absSum(const oclMat &m); + CV_EXPORTS Scalar sqrSum(const oclMat &m); + + //! finds global minimum and maximum array elements and returns their values + // support all C1 types + CV_EXPORTS void minMax(const oclMat &src, double *minVal, double *maxVal = 0, const oclMat &mask = oclMat()); + + //! finds global minimum and maximum array elements and returns their values with locations + // support all C1 types + CV_EXPORTS void minMaxLoc(const oclMat &src, double *minVal, double *maxVal = 0, Point *minLoc = 0, Point *maxLoc = 0, + const oclMat &mask = oclMat()); + + //! counts non-zero array elements + // support all types + CV_EXPORTS int countNonZero(const oclMat &src); + + //! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i)) + // destination array will have the depth type as lut and the same channels number as source + //It supports 8UC1 8UC4 only + CV_EXPORTS void LUT(const oclMat &src, const oclMat &lut, oclMat &dst); + + //! only 8UC1 and 256 bins is supported now + CV_EXPORTS void calcHist(const oclMat &mat_src, oclMat &mat_hist); + //! only 8UC1 and 256 bins is supported now + CV_EXPORTS void equalizeHist(const oclMat &mat_src, oclMat &mat_dst); + + //! only 8UC1 is supported now + CV_EXPORTS Ptr createCLAHE(double clipLimit = 40.0, Size tileGridSize = Size(8, 8)); + + //! bilateralFilter + // supports 8UC1 8UC4 + CV_EXPORTS void bilateralFilter(const oclMat& src, oclMat& dst, int d, double sigmaColor, double sigmaSpace, int borderType=BORDER_DEFAULT); + + //! Applies an adaptive bilateral filter to the input image + // This is not truly a bilateral filter. 
Instead of using user provided fixed parameters, + // the function calculates a constant at each window based on local standard deviation, + // and use this constant to do filtering. + // supports 8UC1, 8UC3 + CV_EXPORTS void adaptiveBilateralFilter(const oclMat& src, oclMat& dst, Size ksize, double sigmaSpace, Point anchor = Point(-1, -1), int borderType=BORDER_DEFAULT); + + //! computes exponent of each matrix element (dst = e**src) + // supports only CV_32FC1, CV_64FC1 type + CV_EXPORTS void exp(const oclMat &src, oclMat &dst); + + //! computes natural logarithm of absolute value of each matrix element: dst = log(abs(src)) + // supports only CV_32FC1, CV_64FC1 type + CV_EXPORTS void log(const oclMat &src, oclMat &dst); + + //! computes magnitude of each (x(i), y(i)) vector + // supports only CV_32F, CV_64F type + CV_EXPORTS void magnitude(const oclMat &x, const oclMat &y, oclMat &magnitude); + + //! computes angle (angle(i)) of each (x(i), y(i)) vector + // supports only CV_32F, CV_64F type + CV_EXPORTS void phase(const oclMat &x, const oclMat &y, oclMat &angle, bool angleInDegrees = false); + + //! the function raises every element of tne input array to p + // support only CV_32F, CV_64F type + CV_EXPORTS void pow(const oclMat &x, double p, oclMat &y); + + //! converts Cartesian coordinates to polar + // supports only CV_32F CV_64F type + CV_EXPORTS void cartToPolar(const oclMat &x, const oclMat &y, oclMat &magnitude, oclMat &angle, bool angleInDegrees = false); + + //! converts polar coordinates to Cartesian + // supports only CV_32F CV_64F type + CV_EXPORTS void polarToCart(const oclMat &magnitude, const oclMat &angle, oclMat &x, oclMat &y, bool angleInDegrees = false); + + //! perfroms per-elements bit-wise inversion + // supports all types + CV_EXPORTS void bitwise_not(const oclMat &src, oclMat &dst); + + //! 
calculates per-element bit-wise disjunction of two arrays + // supports all types + CV_EXPORTS void bitwise_or(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat()); + CV_EXPORTS void bitwise_or(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat()); + + //! calculates per-element bit-wise conjunction of two arrays + // supports all types + CV_EXPORTS void bitwise_and(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat()); + CV_EXPORTS void bitwise_and(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat()); + + //! calculates per-element bit-wise "exclusive or" operation + // supports all types + CV_EXPORTS void bitwise_xor(const oclMat &src1, const oclMat &src2, oclMat &dst, const oclMat &mask = oclMat()); + CV_EXPORTS void bitwise_xor(const oclMat &src1, const Scalar &s, oclMat &dst, const oclMat &mask = oclMat()); + + //! Logical operators + CV_EXPORTS oclMat operator ~ (const oclMat &); + CV_EXPORTS oclMat operator | (const oclMat &, const oclMat &); + CV_EXPORTS oclMat operator & (const oclMat &, const oclMat &); + CV_EXPORTS oclMat operator ^ (const oclMat &, const oclMat &); + + + //! Mathematics operators + CV_EXPORTS oclMatExpr operator + (const oclMat &src1, const oclMat &src2); + CV_EXPORTS oclMatExpr operator - (const oclMat &src1, const oclMat &src2); + CV_EXPORTS oclMatExpr operator * (const oclMat &src1, const oclMat &src2); + CV_EXPORTS oclMatExpr operator / (const oclMat &src1, const oclMat &src2); + + struct CV_EXPORTS ConvolveBuf + { + Size result_size; + Size block_size; + Size user_block_size; + Size dft_size; + + oclMat image_spect, templ_spect, result_spect; + oclMat image_block, templ_block, result_data; + + void create(Size image_size, Size templ_size); + static Size estimateBlockSize(Size result_size, Size templ_size); + }; + + //! 
computes convolution of two images, may use discrete Fourier transform + // support only CV_32FC1 type + CV_EXPORTS void convolve(const oclMat &image, const oclMat &temp1, oclMat &result, bool ccorr = false); + CV_EXPORTS void convolve(const oclMat &image, const oclMat &temp1, oclMat &result, bool ccorr, ConvolveBuf& buf); + + //! Performs a per-element multiplication of two Fourier spectrums. + //! Only full (not packed) CV_32FC2 complex spectrums in the interleaved format are supported for now. + //! support only CV_32FC2 type + CV_EXPORTS void mulSpectrums(const oclMat &a, const oclMat &b, oclMat &c, int flags, float scale, bool conjB = false); + + CV_EXPORTS void cvtColor(const oclMat &src, oclMat &dst, int code, int dcn = 0); + + //! initializes a scaled identity matrix + CV_EXPORTS void setIdentity(oclMat& src, const Scalar & val = Scalar(1)); + + //////////////////////////////// Filter Engine //////////////////////////////// + + /*! + The Base Class for 1D or Row-wise Filters + + This is the base class for linear or non-linear filters that process 1D data. + In particular, such filters are used for the "horizontal" filtering parts in separable filters. + */ + class CV_EXPORTS BaseRowFilter_GPU + { + public: + BaseRowFilter_GPU(int ksize_, int anchor_, int bordertype_) : ksize(ksize_), anchor(anchor_), bordertype(bordertype_) {} + virtual ~BaseRowFilter_GPU() {} + virtual void operator()(const oclMat &src, oclMat &dst) = 0; + int ksize, anchor, bordertype; + }; + + /*! + The Base Class for Column-wise Filters + + This is the base class for linear or non-linear filters that process columns of 2D arrays. + Such filters are used for the "vertical" filtering parts in separable filters. 
+ */ + class CV_EXPORTS BaseColumnFilter_GPU + { + public: + BaseColumnFilter_GPU(int ksize_, int anchor_, int bordertype_) : ksize(ksize_), anchor(anchor_), bordertype(bordertype_) {} + virtual ~BaseColumnFilter_GPU() {} + virtual void operator()(const oclMat &src, oclMat &dst) = 0; + int ksize, anchor, bordertype; + }; + + /*! + The Base Class for Non-Separable 2D Filters. + + This is the base class for linear or non-linear 2D filters. + */ + class CV_EXPORTS BaseFilter_GPU + { + public: + BaseFilter_GPU(const Size &ksize_, const Point &anchor_, const int &borderType_) + : ksize(ksize_), anchor(anchor_), borderType(borderType_) {} + virtual ~BaseFilter_GPU() {} + virtual void operator()(const oclMat &src, oclMat &dst) = 0; + Size ksize; + Point anchor; + int borderType; + }; + + /*! + The Base Class for Filter Engine. + + The class can be used to apply an arbitrary filtering operation to an image. + It contains all the necessary intermediate buffers. + */ + class CV_EXPORTS FilterEngine_GPU + { + public: + virtual ~FilterEngine_GPU() {} + + virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1)) = 0; + }; + + //! returns the non-separable filter engine with the specified filter + CV_EXPORTS Ptr createFilter2D_GPU(const Ptr filter2D); + + //! returns the primitive row filter with the specified kernel + CV_EXPORTS Ptr getLinearRowFilter_GPU(int srcType, int bufType, const Mat &rowKernel, + int anchor = -1, int bordertype = BORDER_DEFAULT); + + //! returns the primitive column filter with the specified kernel + CV_EXPORTS Ptr getLinearColumnFilter_GPU(int bufType, int dstType, const Mat &columnKernel, + int anchor = -1, int bordertype = BORDER_DEFAULT, double delta = 0.0); + + //! 
returns the separable linear filter engine + CV_EXPORTS Ptr createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel, + const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT); + + //! returns the separable filter engine with the specified filters + CV_EXPORTS Ptr createSeparableFilter_GPU(const Ptr &rowFilter, + const Ptr &columnFilter); + + //! returns the Gaussian filter engine + CV_EXPORTS Ptr createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT); + + //! returns filter engine for the generalized Sobel operator + CV_EXPORTS Ptr createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT ); + + //! applies Laplacian operator to the image - // supports only ksize = 1 and ksize = 3 8UC1 8UC4 32FC1 32FC4 data type - CV_EXPORTS void Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize = 1, double scale = 1); ++ // supports only ksize = 1 and ksize = 3 ++ CV_EXPORTS void Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize = 1, double scale = 1, ++ double delta=0, int borderType=BORDER_DEFAULT); + + //! returns 2D box filter - // supports CV_8UC1 and CV_8UC4 source type, dst type must be the same as source type ++ // dst type must be the same as source type + CV_EXPORTS Ptr getBoxFilter_GPU(int srcType, int dstType, + const Size &ksize, Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT); + + //! returns box filter engine + CV_EXPORTS Ptr createBoxFilter_GPU(int srcType, int dstType, const Size &ksize, + const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT); + + //! 
returns 2D filter with the specified kernel - // supports CV_8UC1 and CV_8UC4 types ++ // supports: dst type must be the same as source type + CV_EXPORTS Ptr getLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Size &ksize, + const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT); + + //! returns the non-separable linear filter engine ++ // supports: dst type must be the same as source type + CV_EXPORTS Ptr createLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, + const Point &anchor = Point(-1, -1), int borderType = BORDER_DEFAULT); + + //! smooths the image using the normalized box filter - // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 - // supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101,BORDER_WRAP + CV_EXPORTS void boxFilter(const oclMat &src, oclMat &dst, int ddepth, Size ksize, + Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT); + + //! returns 2D morphological filter + //! only MORPH_ERODE and MORPH_DILATE are supported + // supports CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 types + // kernel must have CV_8UC1 type, one rows and cols == ksize.width * ksize.height + CV_EXPORTS Ptr getMorphologyFilter_GPU(int op, int type, const Mat &kernel, const Size &ksize, + Point anchor = Point(-1, -1)); + + //! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported. + CV_EXPORTS Ptr createMorphologyFilter_GPU(int op, int type, const Mat &kernel, + const Point &anchor = Point(-1, -1), int iterations = 1); + + //! a synonym for normalized box filter - // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 - // supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101 + static inline void blur(const oclMat &src, oclMat &dst, Size ksize, Point anchor = Point(-1, -1), + int borderType = BORDER_CONSTANT) + { + boxFilter(src, dst, -1, ksize, anchor, borderType); + } + + //! 
applies non-separable 2D linear filter to the image - // Note, at the moment this function only works when anchor point is in the kernel center - // and kernel size supported is either 3x3 or 5x5; otherwise the function will fail to output valid result + CV_EXPORTS void filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernel, - Point anchor = Point(-1, -1), int borderType = BORDER_DEFAULT); ++ Point anchor = Point(-1, -1), double delta = 0.0, int borderType = BORDER_DEFAULT); + + //! applies separable 2D linear filter to the image + CV_EXPORTS void sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernelX, const Mat &kernelY, + Point anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT); + + //! applies generalized Sobel operator to the image + // dst.type must equalize src.type + // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 + // supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101 + CV_EXPORTS void Sobel(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1, double delta = 0.0, int bordertype = BORDER_DEFAULT); + + //! applies the vertical or horizontal Scharr operator to the image + // dst.type must equalize src.type + // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 + // supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101 + CV_EXPORTS void Scharr(const oclMat &src, oclMat &dst, int ddepth, int dx, int dy, double scale = 1, double delta = 0.0, int bordertype = BORDER_DEFAULT); + + //! smooths the image using Gaussian filter. 
+ // dst.type must equalize src.type + // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 + // supports border type: BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT,BORDER_REFLECT_101 + CV_EXPORTS void GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT); + + //! erodes the image (applies the local minimum operator) + // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 + CV_EXPORTS void erode( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1, + + int borderType = BORDER_CONSTANT, const Scalar &borderValue = morphologyDefaultBorderValue()); + + + //! dilates the image (applies the local maximum operator) + // supports data type: CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 + CV_EXPORTS void dilate( const oclMat &src, oclMat &dst, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1, + + int borderType = BORDER_CONSTANT, const Scalar &borderValue = morphologyDefaultBorderValue()); + + + //! applies an advanced morphological operation to the image + CV_EXPORTS void morphologyEx( const oclMat &src, oclMat &dst, int op, const Mat &kernel, Point anchor = Point(-1, -1), int iterations = 1, + + int borderType = BORDER_CONSTANT, const Scalar &borderValue = morphologyDefaultBorderValue()); + + + ////////////////////////////// Image processing ////////////////////////////// + //! Does mean shift filtering on GPU. + CV_EXPORTS void meanShiftFiltering(const oclMat &src, oclMat &dst, int sp, int sr, + TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1)); + + //! Does mean shift procedure on GPU. + CV_EXPORTS void meanShiftProc(const oclMat &src, oclMat &dstr, oclMat &dstsp, int sp, int sr, + TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1)); + + //! Does mean shift segmentation with elimination of small regions.
+ CV_EXPORTS void meanShiftSegmentation(const oclMat &src, Mat &dst, int sp, int sr, int minsize, + TermCriteria criteria = TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 5, 1)); + + //! applies fixed threshold to the image. + // supports CV_8UC1 and CV_32FC1 data type + // supports threshold type: THRESH_BINARY, THRESH_BINARY_INV, THRESH_TRUNC, THRESH_TOZERO, THRESH_TOZERO_INV + CV_EXPORTS double threshold(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type = THRESH_TRUNC); + + //! resizes the image + // Supports INTER_NEAREST, INTER_LINEAR + // supports CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 types + CV_EXPORTS void resize(const oclMat &src, oclMat &dst, Size dsize, double fx = 0, double fy = 0, int interpolation = INTER_LINEAR); + + //! Applies a generic geometrical transformation to an image. + + // Supports INTER_NEAREST, INTER_LINEAR. + // Map1 supports CV_16SC2, CV_32FC2 types. + // Src supports CV_8UC1, CV_8UC2, CV_8UC4. + CV_EXPORTS void remap(const oclMat &src, oclMat &dst, oclMat &map1, oclMat &map2, int interpolation, int bordertype, const Scalar &value = Scalar()); + + //! copies 2D array to a larger destination array and pads borders with user-specifiable constant + // supports CV_8UC1, CV_8UC4, CV_32SC1 types + CV_EXPORTS void copyMakeBorder(const oclMat &src, oclMat &dst, int top, int bottom, int left, int right, int boardtype, const Scalar &value = Scalar()); + + //! Smoothes image using median filter + // The source 1- or 4-channel image. m should be 3 or 5, the image depth should be CV_8U or CV_32F. + CV_EXPORTS void medianFilter(const oclMat &src, oclMat &dst, int m); + + //! warps the image using affine transformation + // Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC + // supports CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 types + CV_EXPORTS void warpAffine(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags = INTER_LINEAR); + + //! 
warps the image using perspective transformation + // Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC + // supports CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 types + CV_EXPORTS void warpPerspective(const oclMat &src, oclMat &dst, const Mat &M, Size dsize, int flags = INTER_LINEAR); + + //! computes the integral image and integral for the squared image + // sum will have CV_32S type, sqsum - CV32F type + // supports only CV_8UC1 source type + CV_EXPORTS void integral(const oclMat &src, oclMat &sum, oclMat &sqsum); + CV_EXPORTS void integral(const oclMat &src, oclMat &sum); + CV_EXPORTS void cornerHarris(const oclMat &src, oclMat &dst, int blockSize, int ksize, double k, int bordertype = cv::BORDER_DEFAULT); + CV_EXPORTS void cornerHarris_dxdy(const oclMat &src, oclMat &dst, oclMat &Dx, oclMat &Dy, + int blockSize, int ksize, double k, int bordertype = cv::BORDER_DEFAULT); + CV_EXPORTS void cornerMinEigenVal(const oclMat &src, oclMat &dst, int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT); + CV_EXPORTS void cornerMinEigenVal_dxdy(const oclMat &src, oclMat &dst, oclMat &Dx, oclMat &Dy, + int blockSize, int ksize, int bordertype = cv::BORDER_DEFAULT); + + + /////////////////////////////////// ML /////////////////////////////////////////// + + //! 
Compute closest centers for each line in source and label it after center's index + // supports CV_32FC1/CV_32FC2/CV_32FC4 data type - CV_EXPORTS void distanceToCenters(oclMat &dists, oclMat &labels, const oclMat &src, const oclMat &centers); ++ // supports NORM_L1 and NORM_L2 distType ++ // if indices is provided, only the indexed rows will be calculated and their results are in the same ++ // order of indices ++ CV_EXPORTS void distanceToCenters(oclMat &dists, oclMat &labels, const oclMat &src, const oclMat &centers, int distType = NORM_L2SQR, const oclMat &indices = oclMat()); + + //!Does k-means procedure on GPU + // supports CV_32FC1/CV_32FC2/CV_32FC4 data type + CV_EXPORTS double kmeans(const oclMat &src, int K, oclMat &bestLabels, + TermCriteria criteria, int attemps, int flags, oclMat &centers); + + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ///////////////////////////////////////////CascadeClassifier////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + class CV_EXPORTS OclCascadeClassifier : public cv::CascadeClassifier + { + public: + void detectMultiScale(oclMat &image, CV_OUT std::vector& faces, + double scaleFactor = 1.1, int minNeighbors = 3, int flags = 0, + Size minSize = Size(), Size maxSize = Size()); + }; + + /////////////////////////////// Pyramid ///////////////////////////////////// + CV_EXPORTS void pyrDown(const oclMat &src, oclMat &dst); + + //! upsamples the source image and then smoothes it + CV_EXPORTS void pyrUp(const oclMat &src, oclMat &dst); + + //! performs linear blending of two images + //!
to avoid accuracy errors, the sum of weights shouldn't be very close to zero + // supports only CV_8UC1 source type + CV_EXPORTS void blendLinear(const oclMat &img1, const oclMat &img2, const oclMat &weights1, const oclMat &weights2, oclMat &result); + + //! computes vertical sum, supports only CV_32FC1 images + CV_EXPORTS void columnSum(const oclMat &src, oclMat &sum); + + ///////////////////////////////////////// match_template ///////////////////////////////////////////////////////////// + struct CV_EXPORTS MatchTemplateBuf + { + Size user_block_size; + oclMat imagef, templf; + std::vector images; + std::vector image_sums; + std::vector image_sqsums; + }; + + //! computes the proximity map for the raster template and the image where the template is searched for + // Supports TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED for type 8UC1 and 8UC4 + // Supports TM_SQDIFF, TM_CCORR for type 32FC1 and 32FC4 + CV_EXPORTS void matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method); + + //! computes the proximity map for the raster template and the image where the template is searched for + // Supports TM_SQDIFF, TM_SQDIFF_NORMED, TM_CCORR, TM_CCORR_NORMED, TM_CCOEFF, TM_CCOEFF_NORMED for type 8UC1 and 8UC4 + // Supports TM_SQDIFF, TM_CCORR for type 32FC1 and 32FC4 + CV_EXPORTS void matchTemplate(const oclMat &image, const oclMat &templ, oclMat &result, int method, MatchTemplateBuf &buf); + + + + ///////////////////////////////////////////// Canny ///////////////////////////////////////////// + struct CV_EXPORTS CannyBuf; + + //!
compute edges of the input image using Canny operator + // Support CV_8UC1 only + CV_EXPORTS void Canny(const oclMat &image, oclMat &edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false); + CV_EXPORTS void Canny(const oclMat &image, CannyBuf &buf, oclMat &edges, double low_thresh, double high_thresh, int apperture_size = 3, bool L2gradient = false); + CV_EXPORTS void Canny(const oclMat &dx, const oclMat &dy, oclMat &edges, double low_thresh, double high_thresh, bool L2gradient = false); + CV_EXPORTS void Canny(const oclMat &dx, const oclMat &dy, CannyBuf &buf, oclMat &edges, double low_thresh, double high_thresh, bool L2gradient = false); + + struct CV_EXPORTS CannyBuf + { - CannyBuf() : counter(NULL) {} ++ CannyBuf() : counter(1, 1, CV_32S) { } + ~CannyBuf() + { + release(); + } - explicit CannyBuf(const Size &image_size, int apperture_size = 3) : counter(NULL) ++ explicit CannyBuf(const Size &image_size, int apperture_size = 3) : counter(1, 1, CV_32S) + { + create(image_size, apperture_size); + } + CannyBuf(const oclMat &dx_, const oclMat &dy_); + void create(const Size &image_size, int apperture_size = 3); + void release(); + + oclMat dx, dy; + oclMat dx_buf, dy_buf; + oclMat magBuf, mapBuf; + oclMat trackBuf1, trackBuf2; - void *counter; ++ oclMat counter; + Ptr filterDX, filterDY; + }; + + ///////////////////////////////////////// Hough Transform ///////////////////////////////////////// + //! 
HoughCircles + struct HoughCirclesBuf + { + oclMat edges; + oclMat accum; + oclMat srcPoints; + oclMat centers; + CannyBuf cannyBuf; + }; + + CV_EXPORTS void HoughCircles(const oclMat& src, oclMat& circles, int method, float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles = 4096); + CV_EXPORTS void HoughCircles(const oclMat& src, oclMat& circles, HoughCirclesBuf& buf, int method, float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles = 4096); + CV_EXPORTS void HoughCirclesDownload(const oclMat& d_circles, OutputArray h_circles); + + + ///////////////////////////////////////// clAmdFft related ///////////////////////////////////////// + //! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix. + //! Param dft_size is the size of DFT transform. + //! + //! For complex-to-real transform it is assumed that the source matrix is packed in CLFFT's format. + // support src type of CV32FC1, CV32FC2 + // support flags: DFT_INVERSE, DFT_REAL_OUTPUT, DFT_COMPLEX_OUTPUT, DFT_ROWS + // dft_size is the size of original input, which is used for transformation from complex to real. + // dft_size must be powers of 2, 3 and 5 + // real to complex dft requires at least v1.8 clAmdFft + // real to complex dft output is not the same with cpu version + // real to complex and complex to real does not support DFT_ROWS + CV_EXPORTS void dft(const oclMat &src, oclMat &dst, Size dft_size = Size(), int flags = 0); + + //! 
implements generalized matrix product algorithm GEMM from BLAS + // The functionality requires clAmdBlas library + // only support type CV_32FC1 + // flag GEMM_3_T is not supported + CV_EXPORTS void gemm(const oclMat &src1, const oclMat &src2, double alpha, + const oclMat &src3, double beta, oclMat &dst, int flags = 0); + + //////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector ////////////// + + struct CV_EXPORTS HOGDescriptor + + { + + enum { DEFAULT_WIN_SIGMA = -1 }; + + enum { DEFAULT_NLEVELS = 64 }; + + enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL }; + + + + HOGDescriptor(Size win_size = Size(64, 128), Size block_size = Size(16, 16), + + Size block_stride = Size(8, 8), Size cell_size = Size(8, 8), + + int nbins = 9, double win_sigma = DEFAULT_WIN_SIGMA, + + double threshold_L2hys = 0.2, bool gamma_correction = true, + + int nlevels = DEFAULT_NLEVELS); + + + + size_t getDescriptorSize() const; + + size_t getBlockHistogramSize() const; + + + + void setSVMDetector(const std::vector &detector); + + + + static std::vector getDefaultPeopleDetector(); + + static std::vector getPeopleDetector48x96(); + + static std::vector getPeopleDetector64x128(); + + + + void detect(const oclMat &img, std::vector &found_locations, + + double hit_threshold = 0, Size win_stride = Size(), + + Size padding = Size()); + + + + void detectMultiScale(const oclMat &img, std::vector &found_locations, + + double hit_threshold = 0, Size win_stride = Size(), + + Size padding = Size(), double scale0 = 1.05, + + int group_threshold = 2); + + + + void getDescriptors(const oclMat &img, Size win_stride, + + oclMat &descriptors, + + int descr_format = DESCR_FORMAT_COL_BY_COL); + + + + Size win_size; + + Size block_size; + + Size block_stride; + + Size cell_size; + + int nbins; + + double win_sigma; + + double threshold_L2hys; + + bool gamma_correction; + + int nlevels; + + + + protected: + + // initialize buffers; only need to do once in case of multiscale 
detection + + void init_buffer(const oclMat &img, Size win_stride); + + + + void computeBlockHistograms(const oclMat &img); + + void computeGradient(const oclMat &img, oclMat &grad, oclMat &qangle); + + + + double getWinSigma() const; + + bool checkDetectorSize() const; + + + + static int numPartsWithin(int size, int part_size, int stride); + + static Size numPartsWithin(Size size, Size part_size, Size stride); + + + + // Coefficients of the separating plane + + float free_coef; + + oclMat detector; + + + + // Results of the last classification step + + oclMat labels; + + Mat labels_host; + + + + // Results of the last histogram evaluation step + + oclMat block_hists; + + + + // Gradient computation results + + oclMat grad, qangle; + + + + // scaled image + + oclMat image_scale; + + + + // effect size of input image (might be different from original size after scaling) + + Size effect_size; + + }; + + + ////////////////////////feature2d_ocl///////////////// + /****************************************************************************************\ + * Distance * + \****************************************************************************************/ + template + struct CV_EXPORTS Accumulator + { + typedef T Type; + }; + template<> struct Accumulator + { + typedef float Type; + }; + template<> struct Accumulator + { + typedef float Type; + }; + template<> struct Accumulator + { + typedef float Type; + }; + template<> struct Accumulator + { + typedef float Type; + }; + + /* + * Manhattan distance (city block distance) functor + */ + template + struct CV_EXPORTS L1 + { + enum { normType = NORM_L1 }; + typedef T ValueType; + typedef typename Accumulator::Type ResultType; + + ResultType operator()( const T *a, const T *b, int size ) const + { + return normL1(a, b, size); + } + }; + + /* + * Euclidean distance functor + */ + template + struct CV_EXPORTS L2 + { + enum { normType = NORM_L2 }; + typedef T ValueType; + typedef typename Accumulator::Type ResultType; + +
ResultType operator()( const T *a, const T *b, int size ) const + { + return (ResultType)std::sqrt((double)normL2Sqr(a, b, size)); + } + }; + + /* + * Hamming distance functor - counts the bit differences between two strings - useful for the Brief descriptor + * bit count of A exclusive XOR'ed with B + */ + struct CV_EXPORTS Hamming + { + enum { normType = NORM_HAMMING }; + typedef unsigned char ValueType; + typedef int ResultType; + + /** this will count the bits in a ^ b + */ + ResultType operator()( const unsigned char *a, const unsigned char *b, int size ) const + { + return normHamming(a, b, size); + } + }; + + ////////////////////////////////// BruteForceMatcher ////////////////////////////////// + + class CV_EXPORTS BruteForceMatcher_OCL_base + { + public: + enum DistType {L1Dist = 0, L2Dist, HammingDist}; + explicit BruteForceMatcher_OCL_base(DistType distType = L2Dist); + + // Add descriptors to train descriptor collection + void add(const std::vector &descCollection); + + // Get train descriptors collection + const std::vector &getTrainDescriptors() const; + + // Clear train descriptors collection + void clear(); + + // Return true if there are not train descriptors in collection + bool empty() const; + + // Return true if the matcher supports mask in match methods + bool isMaskSupported() const; + + // Find one best match for each query descriptor + void matchSingle(const oclMat &query, const oclMat &train, + oclMat &trainIdx, oclMat &distance, + const oclMat &mask = oclMat()); + + // Download trainIdx and distance and convert it to CPU vector with DMatch + static void matchDownload(const oclMat &trainIdx, const oclMat &distance, std::vector &matches); + // Convert trainIdx and distance to vector with DMatch + static void matchConvert(const Mat &trainIdx, const Mat &distance, std::vector &matches); + + // Find one best match for each query descriptor + void match(const oclMat &query, const oclMat &train, std::vector &matches, const oclMat &mask = 
oclMat()); + + // Make gpu collection of trains and masks in suitable format for matchCollection function + void makeGpuCollection(oclMat &trainCollection, oclMat &maskCollection, const std::vector &masks = std::vector()); + + // Find one best match from train collection for each query descriptor + void matchCollection(const oclMat &query, const oclMat &trainCollection, + oclMat &trainIdx, oclMat &imgIdx, oclMat &distance, + const oclMat &masks = oclMat()); + + // Download trainIdx, imgIdx and distance and convert it to vector with DMatch + static void matchDownload(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, std::vector &matches); + // Convert trainIdx, imgIdx and distance to vector with DMatch + static void matchConvert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance, std::vector &matches); + + // Find one best match from train collection for each query descriptor. + void match(const oclMat &query, std::vector &matches, const std::vector &masks = std::vector()); + + // Find k best matches for each query descriptor (in increasing order of distances) + void knnMatchSingle(const oclMat &query, const oclMat &train, + oclMat &trainIdx, oclMat &distance, oclMat &allDist, int k, + const oclMat &mask = oclMat()); + + // Download trainIdx and distance and convert it to vector with DMatch + // compactResult is used when mask is not empty. If compactResult is false matches + // vector will have the same size as queryDescriptors rows. If compactResult is true + // matches vector will not contain matches for fully masked out query descriptors. 
+ static void knnMatchDownload(const oclMat &trainIdx, const oclMat &distance, + std::vector< std::vector > &matches, bool compactResult = false); + // Convert trainIdx and distance to vector with DMatch + static void knnMatchConvert(const Mat &trainIdx, const Mat &distance, + std::vector< std::vector > &matches, bool compactResult = false); + + // Find k best matches for each query descriptor (in increasing order of distances). + // compactResult is used when mask is not empty. If compactResult is false matches + // vector will have the same size as queryDescriptors rows. If compactResult is true + // matches vector will not contain matches for fully masked out query descriptors. + void knnMatch(const oclMat &query, const oclMat &train, + std::vector< std::vector > &matches, int k, const oclMat &mask = oclMat(), + bool compactResult = false); + + // Find k best matches from train collection for each query descriptor (in increasing order of distances) + void knnMatch2Collection(const oclMat &query, const oclMat &trainCollection, + oclMat &trainIdx, oclMat &imgIdx, oclMat &distance, + const oclMat &maskCollection = oclMat()); + + // Download trainIdx and distance and convert it to vector with DMatch + // compactResult is used when mask is not empty. If compactResult is false matches + // vector will have the same size as queryDescriptors rows. If compactResult is true + // matches vector will not contain matches for fully masked out query descriptors. + static void knnMatch2Download(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, + std::vector< std::vector > &matches, bool compactResult = false); + // Convert trainIdx and distance to vector with DMatch + static void knnMatch2Convert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance, + std::vector< std::vector > &matches, bool compactResult = false); + + // Find k best matches for each query descriptor (in increasing order of distances). 
+ // compactResult is used when mask is not empty. If compactResult is false matches + // vector will have the same size as queryDescriptors rows. If compactResult is true + // matches vector will not contain matches for fully masked out query descriptors. + void knnMatch(const oclMat &query, std::vector< std::vector > &matches, int k, + const std::vector &masks = std::vector(), bool compactResult = false); + + // Find best matches for each query descriptor which have distance less than maxDistance. + // nMatches.at(0, queryIdx) will contain matches count for queryIdx. + // Be careful: nMatches can be greater than trainIdx.cols - it means that matcher didn't find all matches, + // because it didn't have enough memory. + // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nTrain / 100), 10), + // otherwise the user can pass their own allocated trainIdx and distance with size nQuery x nMaxMatches + // Matches are not sorted. + void radiusMatchSingle(const oclMat &query, const oclMat &train, + oclMat &trainIdx, oclMat &distance, oclMat &nMatches, float maxDistance, + const oclMat &mask = oclMat()); + + // Download trainIdx, nMatches and distance and convert it to vector with DMatch. + // matches will be sorted in increasing order of distances. + // compactResult is used when mask is not empty. If compactResult is false matches + // vector will have the same size as queryDescriptors rows. If compactResult is true + // matches vector will not contain matches for fully masked out query descriptors. + static void radiusMatchDownload(const oclMat &trainIdx, const oclMat &distance, const oclMat &nMatches, + std::vector< std::vector > &matches, bool compactResult = false); + // Convert trainIdx, nMatches and distance to vector with DMatch.
+ static void radiusMatchConvert(const Mat &trainIdx, const Mat &distance, const Mat &nMatches, + std::vector< std::vector > &matches, bool compactResult = false); + + // Find best matches for each query descriptor which have distance less than maxDistance + // in increasing order of distances). + void radiusMatch(const oclMat &query, const oclMat &train, + std::vector< std::vector > &matches, float maxDistance, + const oclMat &mask = oclMat(), bool compactResult = false); + + // Find best matches for each query descriptor which have distance less than maxDistance. + // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nQuery / 100), 10), + // otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches + // Matches doesn't sorted. + void radiusMatchCollection(const oclMat &query, oclMat &trainIdx, oclMat &imgIdx, oclMat &distance, oclMat &nMatches, float maxDistance, + const std::vector &masks = std::vector()); + + // Download trainIdx, imgIdx, nMatches and distance and convert it to vector with DMatch. + // matches will be sorted in increasing order of distances. + // compactResult is used when mask is not empty. If compactResult is false matches + // vector will have the same size as queryDescriptors rows. If compactResult is true + // matches vector will not contain matches for fully masked out query descriptors. + static void radiusMatchDownload(const oclMat &trainIdx, const oclMat &imgIdx, const oclMat &distance, const oclMat &nMatches, + std::vector< std::vector > &matches, bool compactResult = false); + // Convert trainIdx, nMatches and distance to vector with DMatch. 
+ static void radiusMatchConvert(const Mat &trainIdx, const Mat &imgIdx, const Mat &distance, const Mat &nMatches, + std::vector< std::vector > &matches, bool compactResult = false); + + // Find best matches from train collection for each query descriptor which have distance less than + // maxDistance (in increasing order of distances). + void radiusMatch(const oclMat &query, std::vector< std::vector > &matches, float maxDistance, + const std::vector &masks = std::vector(), bool compactResult = false); + + DistType distType; + + private: + std::vector trainDescCollection; + }; + + template + class CV_EXPORTS BruteForceMatcher_OCL; + + template + class CV_EXPORTS BruteForceMatcher_OCL< L1 > : public BruteForceMatcher_OCL_base + { + public: + explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(L1Dist) {} + explicit BruteForceMatcher_OCL(L1 /*d*/) : BruteForceMatcher_OCL_base(L1Dist) {} + }; + template + class CV_EXPORTS BruteForceMatcher_OCL< L2 > : public BruteForceMatcher_OCL_base + { + public: + explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(L2Dist) {} + explicit BruteForceMatcher_OCL(L2 /*d*/) : BruteForceMatcher_OCL_base(L2Dist) {} + }; + template <> class CV_EXPORTS BruteForceMatcher_OCL< Hamming > : public BruteForceMatcher_OCL_base + { + public: + explicit BruteForceMatcher_OCL() : BruteForceMatcher_OCL_base(HammingDist) {} + explicit BruteForceMatcher_OCL(Hamming /*d*/) : BruteForceMatcher_OCL_base(HammingDist) {} + }; + + class CV_EXPORTS BFMatcher_OCL : public BruteForceMatcher_OCL_base + { + public: + explicit BFMatcher_OCL(int norm = NORM_L2) : BruteForceMatcher_OCL_base(norm == NORM_L1 ? L1Dist : norm == NORM_L2 ? L2Dist : HammingDist) {} + }; + + class CV_EXPORTS GoodFeaturesToTrackDetector_OCL + { + public: + explicit GoodFeaturesToTrackDetector_OCL(int maxCorners = 1000, double qualityLevel = 0.01, double minDistance = 0.0, + int blockSize = 3, bool useHarrisDetector = false, double harrisK = 0.04); + + //! 
return 1 rows matrix with CV_32FC2 type + void operator ()(const oclMat& image, oclMat& corners, const oclMat& mask = oclMat()); + //! download points of type Point2f to a vector. the vector's content will be erased + void downloadPoints(const oclMat &points, std::vector &points_v); + + int maxCorners; + double qualityLevel; + double minDistance; + + int blockSize; + bool useHarrisDetector; + double harrisK; + void releaseMemory() + { + Dx_.release(); + Dy_.release(); + eig_.release(); + minMaxbuf_.release(); + tmpCorners_.release(); + } + private: + oclMat Dx_; + oclMat Dy_; + oclMat eig_; + oclMat minMaxbuf_; + oclMat tmpCorners_; + }; + + inline GoodFeaturesToTrackDetector_OCL::GoodFeaturesToTrackDetector_OCL(int maxCorners_, double qualityLevel_, double minDistance_, + int blockSize_, bool useHarrisDetector_, double harrisK_) + { + maxCorners = maxCorners_; + qualityLevel = qualityLevel_; + minDistance = minDistance_; + blockSize = blockSize_; + useHarrisDetector = useHarrisDetector_; + harrisK = harrisK_; + } + + /////////////////////////////// PyrLKOpticalFlow ///////////////////////////////////// + + class CV_EXPORTS PyrLKOpticalFlow + { + public: + PyrLKOpticalFlow() + { + winSize = Size(21, 21); + maxLevel = 3; + iters = 30; + derivLambda = 0.5; + useInitialFlow = false; + minEigThreshold = 1e-4f; + getMinEigenVals = false; + isDeviceArch11_ = false; + } + + void sparse(const oclMat &prevImg, const oclMat &nextImg, const oclMat &prevPts, oclMat &nextPts, + oclMat &status, oclMat *err = 0); + + void dense(const oclMat &prevImg, const oclMat &nextImg, oclMat &u, oclMat &v, oclMat *err = 0); + + Size winSize; + int maxLevel; + int iters; + double derivLambda; + bool useInitialFlow; + float minEigThreshold; + bool getMinEigenVals; + + void releaseMemory() + { + dx_calcBuf_.release(); + dy_calcBuf_.release(); + + prevPyr_.clear(); + nextPyr_.clear(); + + dx_buf_.release(); + dy_buf_.release(); + } + + private: + void calcSharrDeriv(const oclMat &src, oclMat 
&dx, oclMat &dy); + + void buildImagePyramid(const oclMat &img0, std::vector &pyr, bool withBorder); + + oclMat dx_calcBuf_; + oclMat dy_calcBuf_; + + std::vector prevPyr_; + std::vector nextPyr_; + + oclMat dx_buf_; + oclMat dy_buf_; + + oclMat uPyr_[2]; + oclMat vPyr_[2]; + + bool isDeviceArch11_; + }; + + class CV_EXPORTS FarnebackOpticalFlow + { + public: + FarnebackOpticalFlow(); + + int numLevels; + double pyrScale; + bool fastPyramids; + int winSize; + int numIters; + int polyN; + double polySigma; + int flags; + + void operator ()(const oclMat &frame0, const oclMat &frame1, oclMat &flowx, oclMat &flowy); + + void releaseMemory(); + + private: + void prepareGaussian( + int n, double sigma, float *g, float *xg, float *xxg, + double &ig11, double &ig03, double &ig33, double &ig55); + + void setPolynomialExpansionConsts(int n, double sigma); + + void updateFlow_boxFilter( + const oclMat& R0, const oclMat& R1, oclMat& flowx, oclMat &flowy, + oclMat& M, oclMat &bufM, int blockSize, bool updateMatrices); + + void updateFlow_gaussianBlur( + const oclMat& R0, const oclMat& R1, oclMat& flowx, oclMat& flowy, + oclMat& M, oclMat &bufM, int blockSize, bool updateMatrices); + + oclMat frames_[2]; + oclMat pyrLevel_[2], M_, bufM_, R_[2], blurredFrame_[2]; + std::vector pyramid0_, pyramid1_; + }; + + //////////////// build warping maps //////////////////// + //! builds plane warping maps + CV_EXPORTS void buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat &R, const Mat &T, float scale, oclMat &map_x, oclMat &map_y); + //! builds cylindrical warping maps + CV_EXPORTS void buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat &R, float scale, oclMat &map_x, oclMat &map_y); + //! builds spherical warping maps + CV_EXPORTS void buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat &R, float scale, oclMat &map_x, oclMat &map_y); + //! 
builds Affine warping maps + CV_EXPORTS void buildWarpAffineMaps(const Mat &M, bool inverse, Size dsize, oclMat &xmap, oclMat &ymap); + + //! builds Perspective warping maps + CV_EXPORTS void buildWarpPerspectiveMaps(const Mat &M, bool inverse, Size dsize, oclMat &xmap, oclMat &ymap); + + ///////////////////////////////////// interpolate frames ////////////////////////////////////////////// + //! Interpolate frames (images) using provided optical flow (displacement field). + //! frame0 - frame 0 (32-bit floating point images, single channel) + //! frame1 - frame 1 (the same type and size) + //! fu - forward horizontal displacement + //! fv - forward vertical displacement + //! bu - backward horizontal displacement + //! bv - backward vertical displacement + //! pos - new frame position + //! newFrame - new frame + //! buf - temporary buffer, will have width x 6*height size, CV_32FC1 type and contain 6 oclMat; + //! occlusion masks 0, occlusion masks 1, + //! interpolated forward flow 0, interpolated forward flow 1, + //! interpolated backward flow 0, interpolated backward flow 1 + //! + CV_EXPORTS void interpolateFrames(const oclMat &frame0, const oclMat &frame1, + const oclMat &fu, const oclMat &fv, + const oclMat &bu, const oclMat &bv, + float pos, oclMat &newFrame, oclMat &buf); + + //! computes moments of the rasterized shape or a vector of points - CV_EXPORTS Moments ocl_moments(InputArray _array, bool binaryImage); ++ //! _array should be a vector a points standing for the contour ++ CV_EXPORTS Moments ocl_moments(InputArray contour); ++ //! src should be a general image uploaded to the GPU. ++ //! the supported oclMat type are CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1 and CV_64FC1 ++ //! to use type of CV_64FC1, the GPU should support CV_64FC1 ++ CV_EXPORTS Moments ocl_moments(oclMat& src, bool binary); + + class CV_EXPORTS StereoBM_OCL + { + public: + enum { BASIC_PRESET = 0, PREFILTER_XSOBEL = 1 }; + + enum { DEFAULT_NDISP = 64, DEFAULT_WINSZ = 19 }; + + //! 
the default constructor + StereoBM_OCL(); + //! the full constructor taking the camera-specific preset, number of disparities and the SAD window size. ndisparities must be multiple of 8. + StereoBM_OCL(int preset, int ndisparities = DEFAULT_NDISP, int winSize = DEFAULT_WINSZ); + + //! the stereo correspondence operator. Finds the disparity for the specified rectified stereo pair + //! Output disparity has CV_8U type. + void operator() ( const oclMat &left, const oclMat &right, oclMat &disparity); + + //! Some heuristics that tries to estmate + // if current GPU will be faster then CPU in this algorithm. + // It queries current active device. + static bool checkIfGpuCallReasonable(); + + int preset; + int ndisp; + int winSize; + + // If avergeTexThreshold == 0 => post procesing is disabled + // If avergeTexThreshold != 0 then disparity is set 0 in each point (x,y) where for left image + // SumOfHorizontalGradiensInWindow(x, y, winSize) < (winSize * winSize) * avergeTexThreshold + // i.e. input left image is low textured. 
+ float avergeTexThreshold; + private: + oclMat minSSD, leBuf, riBuf; + }; + + class CV_EXPORTS StereoBeliefPropagation + { + public: + enum { DEFAULT_NDISP = 64 }; + enum { DEFAULT_ITERS = 5 }; + enum { DEFAULT_LEVELS = 5 }; + static void estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels); + explicit StereoBeliefPropagation(int ndisp = DEFAULT_NDISP, + int iters = DEFAULT_ITERS, + int levels = DEFAULT_LEVELS, + int msg_type = CV_16S); + StereoBeliefPropagation(int ndisp, int iters, int levels, + float max_data_term, float data_weight, + float max_disc_term, float disc_single_jump, + int msg_type = CV_32F); + void operator()(const oclMat &left, const oclMat &right, oclMat &disparity); + void operator()(const oclMat &data, oclMat &disparity); + int ndisp; + int iters; + int levels; + float max_data_term; + float data_weight; + float max_disc_term; + float disc_single_jump; + int msg_type; + private: + oclMat u, d, l, r, u2, d2, l2, r2; + std::vector datas; + oclMat out; + }; + + class CV_EXPORTS StereoConstantSpaceBP + { + public: + enum { DEFAULT_NDISP = 128 }; + enum { DEFAULT_ITERS = 8 }; + enum { DEFAULT_LEVELS = 4 }; + enum { DEFAULT_NR_PLANE = 4 }; + static void estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels, int &nr_plane); + explicit StereoConstantSpaceBP( + int ndisp = DEFAULT_NDISP, + int iters = DEFAULT_ITERS, + int levels = DEFAULT_LEVELS, + int nr_plane = DEFAULT_NR_PLANE, + int msg_type = CV_32F); + StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane, + float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, + int min_disp_th = 0, + int msg_type = CV_32F); + void operator()(const oclMat &left, const oclMat &right, oclMat &disparity); + int ndisp; + int iters; + int levels; + int nr_plane; + float max_data_term; + float data_weight; + float max_disc_term; + float disc_single_jump; + int min_disp_th; + int msg_type; + bool 
use_local_init_data_cost; + private: + oclMat u[2], d[2], l[2], r[2]; + oclMat disp_selected_pyr[2]; + oclMat data_cost; + oclMat data_cost_selected; + oclMat temp; + oclMat out; + }; + + // Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method + // + // see reference: + // [1] C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow". + // [2] Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation". + class CV_EXPORTS OpticalFlowDual_TVL1_OCL + { + public: + OpticalFlowDual_TVL1_OCL(); + + void operator ()(const oclMat& I0, const oclMat& I1, oclMat& flowx, oclMat& flowy); + + void collectGarbage(); + + /** + * Time step of the numerical scheme. + */ + double tau; + + /** + * Weight parameter for the data term, attachment parameter. + * This is the most relevant parameter, which determines the smoothness of the output. + * The smaller this parameter is, the smoother the solutions we obtain. + * It depends on the range of motions of the images, so its value should be adapted to each image sequence. + */ + double lambda; + + /** + * Weight parameter for (u - v)^2, tightness parameter. + * It serves as a link between the attachment and the regularization terms. + * In theory, it should have a small value in order to maintain both parts in correspondence. + * The method is stable for a large range of values of this parameter. + */ + double theta; + + /** + * Number of scales used to create the pyramid of images. + */ + int nscales; + + /** + * Number of warpings per scale. + * Represents the number of times that I1(x+u0) and grad( I1(x+u0) ) are computed per scale. + * This is a parameter that assures the stability of the method. + * It also affects the running time, so it is a compromise between speed and accuracy. + */ + int warps; + + /** + * Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time. 
+ * A small value will yield more accurate solutions at the expense of a slower convergence. + */ + double epsilon; + + /** + * Stopping criterion iterations number used in the numerical scheme. + */ + int iterations; + + bool useInitialFlow; + + private: + void procOneScale(const oclMat& I0, const oclMat& I1, oclMat& u1, oclMat& u2); + + std::vector I0s; + std::vector I1s; + std::vector u1s; + std::vector u2s; + + oclMat I1x_buf; + oclMat I1y_buf; + + oclMat I1w_buf; + oclMat I1wx_buf; + oclMat I1wy_buf; + + oclMat grad_buf; + oclMat rho_c_buf; + + oclMat p11_buf; + oclMat p12_buf; + oclMat p21_buf; + oclMat p22_buf; + + oclMat diff_buf; + oclMat norm_buf; + }; + // current supported sorting methods + enum + { + SORT_BITONIC, // only support power-of-2 buffer size + SORT_SELECTION, // cannot sort duplicate keys + SORT_MERGE, + SORT_RADIX // only support signed int/float keys(CV_32S/CV_32F) + }; + //! Returns the sorted result of all the elements in input based on equivalent keys. + // + // The element unit in the values to be sorted is determined from the data type, + // i.e., a CV_32FC2 input {a1a2, b1b2} will be considered as two elements, regardless its + // matrix dimension. + // both keys and values will be sorted inplace + // Key needs to be single channel oclMat. + // + // Example: + // input - + // keys = {2, 3, 1} (CV_8UC1) + // values = {10,5, 4,3, 6,2} (CV_8UC2) + // sortByKey(keys, values, SORT_SELECTION, false); + // output - + // keys = {1, 2, 3} (CV_8UC1) + // values = {6,2, 10,5, 4,3} (CV_8UC2) + CV_EXPORTS void sortByKey(oclMat& keys, oclMat& values, int method, bool isGreaterThan = false); + /*!Base class for MOG and MOG2!*/ + class CV_EXPORTS BackgroundSubtractor + { + public: + //! the virtual destructor + virtual ~BackgroundSubtractor(); + //! the update operator that takes the next video frame and returns the current foreground mask as 8-bit binary image. 
+ virtual void operator()(const oclMat& image, oclMat& fgmask, float learningRate); + + //! computes a background image + virtual void getBackgroundImage(oclMat& backgroundImage) const = 0; + }; + /*! + Gaussian Mixture-based Backbround/Foreground Segmentation Algorithm + + The class implements the following algorithm: + "An improved adaptive background mixture model for real-time tracking with shadow detection" + P. KadewTraKuPong and R. Bowden, + Proc. 2nd European Workshp on Advanced Video-Based Surveillance Systems, 2001." + http://personal.ee.surrey.ac.uk/Personal/R.Bowden/publications/avbs01/avbs01.pdf + */ + class CV_EXPORTS MOG: public cv::ocl::BackgroundSubtractor + { + public: + //! the default constructor + MOG(int nmixtures = -1); + + //! re-initiaization method + void initialize(Size frameSize, int frameType); + + //! the update operator + void operator()(const oclMat& frame, oclMat& fgmask, float learningRate = 0.f); + + //! computes a background image which are the mean of all background gaussians + void getBackgroundImage(oclMat& backgroundImage) const; + + //! releases all inner buffers + void release(); + + int history; + float varThreshold; + float backgroundRatio; + float noiseSigma; + + private: + int nmixtures_; + + Size frameSize_; + int frameType_; + int nframes_; + + oclMat weight_; + oclMat sortKey_; + oclMat mean_; + oclMat var_; + }; + + /*! + The class implements the following algorithm: + "Improved adaptive Gausian mixture model for background subtraction" + Z.Zivkovic + International Conference Pattern Recognition, UK, August, 2004. + http://www.zoranz.net/Publications/zivkovic2004ICPR.pdf + */ + class CV_EXPORTS MOG2: public cv::ocl::BackgroundSubtractor + { + public: + //! the default constructor + MOG2(int nmixtures = -1); + + //! re-initiaization method + void initialize(Size frameSize, int frameType); + + //! the update operator + void operator()(const oclMat& frame, oclMat& fgmask, float learningRate = -1.0f); + + //! 
computes a background image which are the mean of all background gaussians + void getBackgroundImage(oclMat& backgroundImage) const; + + //! releases all inner buffers + void release(); + + // parameters + // you should call initialize after parameters changes + + int history; + + //! here it is the maximum allowed number of mixture components. + //! Actual number is determined dynamically per pixel + float varThreshold; + // threshold on the squared Mahalanobis distance to decide if it is well described + // by the background model or not. Related to Cthr from the paper. + // This does not influence the update of the background. A typical value could be 4 sigma + // and that is varThreshold=4*4=16; Corresponds to Tb in the paper. + + ///////////////////////// + // less important parameters - things you might change but be carefull + //////////////////////// + + float backgroundRatio; + // corresponds to fTB=1-cf from the paper + // TB - threshold when the component becomes significant enough to be included into + // the background model. It is the TB=1-cf from the paper. So I use cf=0.1 => TB=0. + // For alpha=0.001 it means that the mode should exist for approximately 105 frames before + // it is considered foreground + // float noiseSigma; + float varThresholdGen; + + //correspondts to Tg - threshold on the squared Mahalan. dist. to decide + //when a sample is close to the existing components. If it is not close + //to any a new component will be generated. I use 3 sigma => Tg=3*3=9. + //Smaller Tg leads to more generated components and higher Tg might make + //lead to small number of components but they can grow too large + float fVarInit; + float fVarMin; + float fVarMax; + + //initial variance for the newly generated components. + //It will will influence the speed of adaptation. A good guess should be made. + //A simple way is to estimate the typical standard deviation from the images. 
+ //I used here 10 as a reasonable value + // min and max can be used to further control the variance + float fCT; //CT - complexity reduction prior + //this is related to the number of samples needed to accept that a component + //actually exists. We use CT=0.05 of all the samples. By setting CT=0 you get + //the standard Stauffer&Grimson algorithm (maybe not exact but very similar) + + //shadow detection parameters + bool bShadowDetection; //default 1 - do shadow detection + unsigned char nShadowDetection; //do shadow detection - insert this value as the detection result - 127 default value + float fTau; + // Tau - shadow threshold. The shadow is detected if the pixel is darker + //version of the background. Tau is a threshold on how much darker the shadow can be. + //Tau= 0.5 means that if pixel is more than 2 times darker then it is not shadow + //See: Prati,Mikic,Trivedi,Cucchiarra,"Detecting Moving Shadows...",IEEE PAMI,2003. + + private: + int nmixtures_; + + Size frameSize_; + int frameType_; + int nframes_; + + oclMat weight_; + oclMat variance_; + oclMat mean_; + + oclMat bgmodelUsedModes_; //keep track of number of modes per pixel + }; + + /*!***************Kalman Filter*************!*/ + class CV_EXPORTS KalmanFilter + { + public: + KalmanFilter(); + //! the full constructor taking the dimensionality of the state, of the measurement and of the control vector + KalmanFilter(int dynamParams, int measureParams, int controlParams=0, int type=CV_32F); + //! re-initializes Kalman filter. The previous content is destroyed. 
+ void init(int dynamParams, int measureParams, int controlParams=0, int type=CV_32F); + + const oclMat& predict(const oclMat& control=oclMat()); + const oclMat& correct(const oclMat& measurement); + + oclMat statePre; //!< predicted state (x'(k)): x(k)=A*x(k-1)+B*u(k) + oclMat statePost; //!< corrected state (x(k)): x(k)=x'(k)+K(k)*(z(k)-H*x'(k)) + oclMat transitionMatrix; //!< state transition matrix (A) + oclMat controlMatrix; //!< control matrix (B) (not used if there is no control) + oclMat measurementMatrix; //!< measurement matrix (H) + oclMat processNoiseCov; //!< process noise covariance matrix (Q) + oclMat measurementNoiseCov;//!< measurement noise covariance matrix (R) + oclMat errorCovPre; //!< priori error estimate covariance matrix (P'(k)): P'(k)=A*P(k-1)*At + Q)*/ + oclMat gain; //!< Kalman gain matrix (K(k)): K(k)=P'(k)*Ht*inv(H*P'(k)*Ht+R) + oclMat errorCovPost; //!< posteriori error estimate covariance matrix (P(k)): P(k)=(I-K(k)*H)*P'(k) + private: + oclMat temp1; + oclMat temp2; + oclMat temp3; + oclMat temp4; + oclMat temp5; + }; + + /*!***************K Nearest Neighbour*************!*/ + class CV_EXPORTS KNearestNeighbour: public CvKNearest + { + public: + KNearestNeighbour(); + ~KNearestNeighbour(); + + bool train(const Mat& trainData, Mat& labels, Mat& sampleIdx = Mat().setTo(Scalar::all(0)), + bool isRegression = false, int max_k = 32, bool updateBase = false); + + void clear(); + + void find_nearest(const oclMat& samples, int k, oclMat& lables); + + private: + oclMat samples_ocl; + }; + + /*!*************** SVM *************!*/ + class CV_EXPORTS CvSVM_OCL : public CvSVM + { + public: + CvSVM_OCL(); + + CvSVM_OCL(const cv::Mat& trainData, const cv::Mat& responses, + const cv::Mat& varIdx=cv::Mat(), const cv::Mat& sampleIdx=cv::Mat(), + CvSVMParams params=CvSVMParams()); + CV_WRAP float predict( const int row_index, Mat& src, bool returnDFVal=false ) const; + CV_WRAP void predict( cv::InputArray samples, cv::OutputArray results ) const; + 
CV_WRAP float predict( const cv::Mat& sample, bool returnDFVal=false ) const; + float predict( const CvMat* samples, CV_OUT CvMat* results ) const; + + protected: + float predict( const int row_index, int row_len, Mat& src, bool returnDFVal=false ) const; + void create_kernel(); + void create_solver(); + }; + + /*!*************** END *************!*/ + } +} +#if defined _MSC_VER && _MSC_VER >= 1200 +# pragma warning( push) +# pragma warning( disable: 4267) +#endif +#include "opencv2/ocl/matrix_operations.hpp" +#if defined _MSC_VER && _MSC_VER >= 1200 +# pragma warning( pop) +#endif + +#endif /* __OPENCV_OCL_HPP__ */ diff --cc modules/ocl/include/opencv2/ocl/private/opencl_dumpinfo.hpp index beb3d27,beb3d27..e384544 --- a/modules/ocl/include/opencv2/ocl/private/opencl_dumpinfo.hpp +++ b/modules/ocl/include/opencv2/ocl/private/opencl_dumpinfo.hpp @@@ -21,7 -21,7 +21,7 @@@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation --// and/or other oclMaterials provided with the distribution. ++// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. diff --cc modules/ocl/include/opencv2/ocl/private/opencl_utils.hpp index 70c45d3,70c45d3..08f980f --- a/modules/ocl/include/opencv2/ocl/private/opencl_utils.hpp +++ b/modules/ocl/include/opencv2/ocl/private/opencl_utils.hpp @@@ -21,7 -21,7 +21,7 @@@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation --// and/or other oclMaterials provided with the distribution. ++// and/or other materials provided with the distribution. 
// // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. diff --cc modules/ocl/include/opencv2/ocl/private/util.hpp index 670b03c,88f603b..efb684c --- a/modules/ocl/include/opencv2/ocl/private/util.hpp +++ b/modules/ocl/include/opencv2/ocl/private/util.hpp @@@ -25,7 -25,7 +25,7 @@@ // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation --// and/or other oclMaterials provided with the distribution. ++// and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. @@@ -100,18 -100,22 +100,22 @@@ CV_EXPORTS void openCLFree(void *devPtr CV_EXPORTS cl_mem openCLCreateBuffer(Context *clCxt, size_t flag, size_t size); CV_EXPORTS void openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size); CV_EXPORTS cl_kernel openCLGetKernelFromSource(const Context *clCxt, - const cv::ocl::ProgramEntry* source, std::string kernelName); + const cv::ocl::ProgramEntry* source, String kernelName); CV_EXPORTS cl_kernel openCLGetKernelFromSource(const Context *clCxt, - const cv::ocl::ProgramEntry* source, std::string kernelName, const char *build_options); + const cv::ocl::ProgramEntry* source, String kernelName, const char *build_options); + CV_EXPORTS cl_kernel openCLGetKernelFromSource(Context *ctx, const cv::ocl::ProgramEntry* source, - string kernelName, int channels, int depth, const char *build_options); ++ String kernelName, int channels, int depth, const char *build_options); CV_EXPORTS void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads); + CV_EXPORTS void openCLExecuteKernel(Context *ctx, cl_kernel kernel, size_t globalThreads[3], + size_t localThreads[3], 
std::vector< std::pair > &args); -CV_EXPORTS void openCLExecuteKernel(Context *clCxt , const cv::ocl::ProgramEntry* source, string kernelName, std::vector< std::pair > &args, +CV_EXPORTS void openCLExecuteKernel(Context *clCxt , const cv::ocl::ProgramEntry* source, String kernelName, std::vector< std::pair > &args, int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1); -CV_EXPORTS void openCLExecuteKernel_(Context *clCxt, const cv::ocl::ProgramEntry* source, std::string kernelName, +CV_EXPORTS void openCLExecuteKernel_(Context *clCxt, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3], size_t localThreads[3], std::vector< std::pair > &args, int channels, int depth, const char *build_options); -CV_EXPORTS void openCLExecuteKernel(Context *clCxt, const cv::ocl::ProgramEntry* source, std::string kernelName, size_t globalThreads[3], +CV_EXPORTS void openCLExecuteKernel(Context *clCxt, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3], size_t localThreads[3], std::vector< std::pair > &args, int channels, int depth); -CV_EXPORTS void openCLExecuteKernel(Context *clCxt, const cv::ocl::ProgramEntry* source, std::string kernelName, size_t globalThreads[3], +CV_EXPORTS void openCLExecuteKernel(Context *clCxt, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3], size_t localThreads[3], std::vector< std::pair > &args, int channels, int depth, const char *build_options); diff --cc modules/ocl/perf/perf_hough.cpp index f259bd1,0000000..e90356a mode 100644,000000..100644 --- a/modules/ocl/perf/perf_hough.cpp +++ b/modules/ocl/perf/perf_hough.cpp @@@ -1,106 -1,0 +1,106 @@@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. 
+// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. +// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation - // and/or other oclMaterials provided with the distribution. ++// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors as is and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + +#include "perf_precomp.hpp" + +#ifdef HAVE_OPENCL + +using namespace cv; +using namespace perf; + +////////////////////////////////////////////////////////////////////// +// HoughCircles + +typedef std::tr1::tuple Size_Dp_MinDist_t; +typedef perf::TestBaseWithParam Size_Dp_MinDist; + +PERF_TEST_P(Size_Dp_MinDist, OCL_HoughCircles, + testing::Combine( + testing::Values(perf::sz720p, perf::szSXGA, perf::sz1080p), + testing::Values(1.0f, 2.0f, 4.0f), + testing::Values(1.0f, 10.0f))) +{ + const Size_Dp_MinDist_t params = GetParam(); + const cv::Size size = std::tr1::get<0>(params); + const float dp = std::tr1::get<1>(params); + const float minDist = std::tr1::get<2>(params); + + const int minRadius = 10; + const int maxRadius = 30; + const int cannyThreshold = 100; + const int votesThreshold = 15; + + cv::RNG rng(123456789); + + cv::Mat src(size, CV_8UC1, cv::Scalar::all(0)), circles; + + const int numCircles = rng.uniform(50, 100); + for (int i = 0; i < numCircles; ++i) + { + cv::Point center(rng.uniform(0, src.cols), rng.uniform(0, src.rows)); + const int radius = rng.uniform(minRadius, maxRadius + 1); + + cv::circle(src, center, radius, cv::Scalar::all(255), -1); + } + + declare.time(10.0).iterations(25); + + if (RUN_OCL_IMPL) + { + cv::ocl::oclMat ocl_src(src), ocl_circles; + + OCL_TEST_CYCLE() cv::ocl::HoughCircles(ocl_src, ocl_circles, HOUGH_GRADIENT, dp, minDist, + cannyThreshold, votesThreshold, minRadius, maxRadius); + } + else if (RUN_PLAIN_IMPL) + { + TEST_CYCLE() cv::HoughCircles(src, circles, HOUGH_GRADIENT, dp, minDist, cannyThreshold, + votesThreshold, minRadius, maxRadius); + } + else + OCL_PERF_ELSE + + int value = 0; + SANITY_CHECK(value); +} + +#endif // HAVE_OPENCL diff --cc modules/ocl/src/arithm.cpp index 6bfa733,9b24b16..5bcfbe1 --- a/modules/ocl/src/arithm.cpp +++ b/modules/ocl/src/arithm.cpp @@@ -472,21 -472,25 +472,25 @@@ static void arithmetic_minMax_run(cons const char * const typeMap[] = { "uchar", "char", "ushort", "short", 
"int", "float", "double" }; const char * const channelMap[] = { " ", " ", "2", "4", "4" }; - ostringstream stream; + std::ostringstream stream; stream << "-D T=" << typeMap[src.depth()] << channelMap[src.channels()]; - stream << " -D MAX_VAL=" << (WT)std::numeric_limits::max(); - stream << " -D MIN_VAL=" << (std::numeric_limits::is_integer ? - (WT)std::numeric_limits::min() : -(WT)(std::numeric_limits::max())); - if (numeric_limits::is_integer) ++ if (std::numeric_limits::is_integer) + { - stream << " -D MAX_VAL=" << (WT)numeric_limits::max(); - stream << " -D MIN_VAL=" << (WT)numeric_limits::min(); ++ stream << " -D MAX_VAL=" << (WT)std::numeric_limits::max(); ++ stream << " -D MIN_VAL=" << (WT)std::numeric_limits::min(); + } + else + stream << " -D DEPTH_" << src.depth(); std::string buildOptions = stream.str(); - vector > args; - args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data)); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&cols )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&invalid_cols )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&offset)); - args.push_back( make_pair( sizeof(cl_int) , (void *)&elemnum)); - args.push_back( make_pair( sizeof(cl_int) , (void *)&groupnum)); + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data)); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&invalid_cols )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&elemnum)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&groupnum)); int minvalid_cols = 0, moffset = 0; if (!mask.empty()) @@@ -693,83 -697,47 +697,47 @@@ double cv::ocl::norm(const oclMat &src1 ////////////////////////////////// flip 
////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// - static void arithmetic_flip_rows_run(const oclMat &src, oclMat &dst, String kernelName) - { - int channels = dst.oclchannels(); - int depth = dst.depth(); + enum { FLIP_COLS = 1 << 0, FLIP_ROWS = 1 << 1, FLIP_BOTH = FLIP_ROWS | FLIP_COLS }; - int vector_lengths[4][7] = {{4, 4, 4, 4, 1, 1, 1}, - {4, 4, 4, 4, 1, 1, 1}, - {4, 4, 4, 4, 1, 1, 1}, - {4, 4, 4, 4, 1, 1, 1} - }; - - size_t vector_length = vector_lengths[channels - 1][depth]; - int offset_cols = ((dst.offset % dst.step) / dst.elemSize1()) & (vector_length - 1); - - int cols = divUp(dst.cols * channels + offset_cols, vector_length); - int rows = divUp(dst.rows, 2); - - size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { cols, rows, 1 }; - - int dst_step1 = dst.cols * dst.elemSize(); - std::vector > args; - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.offset )); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 )); - - openCLExecuteKernel(src.clCxt, &arithm_flip, kernelName, globalThreads, localThreads, args, -1, depth); - } - - static void arithmetic_flip_cols_run(const oclMat &src, oclMat &dst, String kernelName, bool isVertical) -static void arithmetic_flip_run(const oclMat &src, oclMat &dst, string kernelName, int flipType) ++static void arithmetic_flip_run(const oclMat &src, 
oclMat &dst, String kernelName, int flipType) { - int channels = dst.oclchannels(); - int depth = dst.depth(); + int cols = dst.cols, rows = dst.rows; + if ((cols == 1 && flipType == FLIP_COLS) || + (rows == 1 && flipType == FLIP_ROWS) || + (rows == 1 && cols == 1 && flipType == FLIP_BOTH)) + { + src.copyTo(dst); + return; + } - int vector_lengths[4][7] = {{1, 1, 1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1, 1} - }; + cols = flipType == FLIP_COLS ? divUp(cols, 2) : cols; + rows = flipType & FLIP_ROWS ? divUp(rows, 2) : rows; - size_t vector_length = vector_lengths[channels - 1][depth]; - int offset_cols = ((dst.offset % dst.step) / dst.elemSize()) & (vector_length - 1); - int cols = divUp(dst.cols + offset_cols, vector_length); - cols = isVertical ? cols : divUp(cols, 2); - int rows = isVertical ? divUp(dst.rows, 2) : dst.rows; + const char * const channelMap[] = { "", "", "2", "4", "4" }; + const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" }; + std::string buildOptions = format("-D T=%s%s", typeMap[dst.depth()], channelMap[dst.oclchannels()]); size_t localThreads[3] = { 64, 4, 1 }; size_t globalThreads[3] = { cols, rows, 1 }; - int dst_step1 = dst.cols * dst.elemSize(); + int elemSize = src.elemSize(); + int src_step = src.step / elemSize, src_offset = src.offset / elemSize; + int dst_step = dst.step / elemSize, dst_offset = dst.offset / elemSize; + - vector > args; - args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src_step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src_offset )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_offset )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows )); - args.push_back( make_pair( sizeof(cl_int), (void 
*)&dst.cols )); - args.push_back( make_pair( sizeof(cl_int), (void *)&rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.offset )); ++ args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_step )); ++ args.push_back( std::make_pair( sizeof(cl_int), (void *)&src_offset )); + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.step )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.offset )); ++ args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step )); ++ args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_offset )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows )); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.cols )); ++ args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows )); ++ args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols )); - if (isVertical) - args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows )); - else - args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols )); - - args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1 )); - - const cv::ocl::ProgramEntry* source = isVertical ? 
&arithm_flip_rc : &arithm_flip; - - openCLExecuteKernel(src.clCxt, source, kernelName, globalThreads, localThreads, args, src.oclchannels(), depth); + openCLExecuteKernel(src.clCxt, &arithm_flip, kernelName, globalThreads, localThreads, args, + -1, -1, buildOptions.c_str()); } void cv::ocl::flip(const oclMat &src, oclMat &dst, int flipCode) diff --cc modules/ocl/src/blend.cpp index c9bba13,a2b70f0..39f09c4 --- a/modules/ocl/src/blend.cpp +++ b/modules/ocl/src/blend.cpp @@@ -49,35 -49,51 +49,51 @@@ using namespace cv; using namespace cv::ocl; - void cv::ocl::blendLinear(const oclMat &img1, const oclMat &img2, const oclMat &weights1, const oclMat &weights2, - oclMat &result) + void cv::ocl::blendLinear(const oclMat &src1, const oclMat &src2, const oclMat &weights1, const oclMat &weights2, + oclMat &dst) { - cv::ocl::Context *ctx = img1.clCxt; - CV_Assert(ctx == img2.clCxt && ctx == weights1.clCxt && ctx == weights2.clCxt); - int channels = img1.oclchannels(); - int depth = img1.depth(); - int rows = img1.rows; - int cols = img1.cols; - int istep = img1.step1(); - int wstep = weights1.step1(); - size_t globalSize[] = {cols * channels / 4, rows, 1}; - size_t localSize[] = {256, 1, 1}; + CV_Assert(src1.depth() <= CV_32F); + CV_Assert(src1.size() == src2.size() && src1.type() == src2.type()); + CV_Assert(weights1.size() == weights2.size() && weights1.size() == src1.size() && + weights1.type() == CV_32FC1 && weights2.type() == CV_32FC1); + + dst.create(src1.size(), src1.type()); + + size_t globalSize[] = { dst.cols, dst.rows, 1}; + size_t localSize[] = { 16, 16, 1 }; + + int depth = dst.depth(), ocn = dst.oclchannels(); + int src1_step = src1.step / src1.elemSize(), src1_offset = src1.offset / src1.elemSize(); + int src2_step = src2.step / src2.elemSize(), src2_offset = src2.offset / src2.elemSize(); + int weight1_step = weights1.step / weights1.elemSize(), weight1_offset = weights1.offset / weights1.elemSize(); + int weight2_step = weights2.step / weights2.elemSize(), 
weight2_offset = weights2.offset / weights2.elemSize(); + int dst_step = dst.step / dst.elemSize(), dst_offset = dst.offset / dst.elemSize(); + + const char * const channelMap[] = { "", "", "2", "4", "4" }; + const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" }; + std::string buildOptions = format("-D T=%s%s -D convertToT=convert_%s%s%s -D FT=float%s -D convertToFT=convert_float%s", + typeMap[depth], channelMap[ocn], typeMap[depth], channelMap[ocn], + depth >= CV_32S ? "" : "_sat_rte", channelMap[ocn], channelMap[ocn]); - vector< pair > args; - args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1_offset )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src1_step )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2_offset )); - args.push_back( make_pair( sizeof(cl_int), (void *)&src2_step )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&weights1.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&weight1_offset )); - args.push_back( make_pair( sizeof(cl_int), (void *)&weight1_step )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&weights2.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&weight2_offset )); - args.push_back( make_pair( sizeof(cl_int), (void *)&weight2_step )); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_offset )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows )); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols )); + std::vector< std::pair > args; - result.create(img1.size(), CV_MAKE_TYPE(depth,img1.channels())); - if(globalSize[0] != 0) - { - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&result.data )); - args.push_back( 
std::make_pair( sizeof(cl_mem), (void *)&img1.data )); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&img2.data )); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&weights1.data )); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&weights2.data )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&istep )); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&wstep )); - String kernelName = "BlendLinear"; ++ args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src1.data )); ++ args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1_offset )); ++ args.push_back( std::make_pair( sizeof(cl_int), (void *)&src1_step )); ++ args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src2.data )); ++ args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2_offset )); ++ args.push_back( std::make_pair( sizeof(cl_int), (void *)&src2_step )); ++ args.push_back( std::make_pair( sizeof(cl_mem), (void *)&weights1.data )); ++ args.push_back( std::make_pair( sizeof(cl_int), (void *)&weight1_offset )); ++ args.push_back( std::make_pair( sizeof(cl_int), (void *)&weight1_step )); ++ args.push_back( std::make_pair( sizeof(cl_mem), (void *)&weights2.data )); ++ args.push_back( std::make_pair( sizeof(cl_int), (void *)&weight2_offset )); ++ args.push_back( std::make_pair( sizeof(cl_int), (void *)&weight2_step )); ++ args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data )); ++ args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_offset )); ++ args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step )); ++ args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.rows )); ++ args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst.cols )); - openCLExecuteKernel(ctx, &blend_linear, kernelName, globalSize, localSize, args, channels, depth); - } + 
openCLExecuteKernel(src1.clCxt, &blend_linear, "blendLinear", globalSize, localSize, args, + -1, -1, buildOptions.c_str()); } diff --cc modules/ocl/src/canny.cpp index 3f5de52,e0d788b..8c68d8b --- a/modules/ocl/src/canny.cpp +++ b/modules/ocl/src/canny.cpp @@@ -78,20 -78,10 +78,11 @@@ void cv::ocl::CannyBuf::create(const Si filterDY = createDerivFilter_GPU(CV_8U, CV_32S, 0, 1, apperture_size, BORDER_REPLICATE); } } - ensureSizeIsEnough(2 * (image_size.height + 2), image_size.width + 2, CV_32FC1, edgeBuf); + ensureSizeIsEnough(image_size.height + 2, image_size.width + 2, CV_32FC1, magBuf); + ensureSizeIsEnough(image_size.height + 2, image_size.width + 2, CV_32FC1, mapBuf); - ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf1); - ensureSizeIsEnough(1, image_size.width * image_size.height, CV_16UC2, trackBuf2); - - int counter_i [1] = { 0 }; - int err = 0; - if(counter) - { - openCLFree(counter); - } - counter = clCreateBuffer( *((cl_context*)getClContextPtr()), CL_MEM_COPY_HOST_PTR, sizeof(int), counter_i, &err ); - openCLSafeCall(err); + ensureSizeIsEnough(1, image_size.area(), CV_16UC2, trackBuf1); + ensureSizeIsEnough(1, image_size.area(), CV_16UC2, trackBuf2); } void cv::ocl::CannyBuf::release() @@@ -100,15 -90,9 +91,10 @@@ dy.release(); dx_buf.release(); dy_buf.release(); - edgeBuf.release(); + magBuf.release(); + mapBuf.release(); trackBuf1.release(); trackBuf2.release(); - if(counter) - { - openCLFree(counter); - counter = NULL; - } } namespace cv @@@ -320,54 -312,61 +306,61 @@@ void canny::calcMap_gpu(oclMat &dx, ocl openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1); } - void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, void *counter, int rows, int cols) + void canny::edgesHysteresisLocal_gpu(oclMat &map, oclMat &st1, oclMat& counter, int rows, int cols) { Context *clCxt = map.clCxt; - String kernelName = "edgesHysteresisLocal"; - vector< pair > args; + std::vector< 
std::pair > args; + Mat counterMat(counter.rows, counter.cols, counter.type()); + counterMat.at(0, 0) = 0; + counter.upload(counterMat); + - args.push_back( make_pair( sizeof(cl_mem), (void *)&map.data)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&st1.data)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&counter.data)); - args.push_back( make_pair( sizeof(cl_int), (void *)&rows)); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols)); + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&map.data)); + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&st1.data)); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counter)); ++ args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counter.data)); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows)); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.step)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.offset)); + cl_int stepBytes = map.step; - args.push_back( make_pair( sizeof(cl_int), (void *)&stepBytes)); ++ args.push_back( std::make_pair( sizeof(cl_int), (void *)&stepBytes)); + cl_int offsetBytes = map.offset; - args.push_back( make_pair( sizeof(cl_int), (void *)&offsetBytes)); ++ args.push_back( std::make_pair( sizeof(cl_int), (void *)&offsetBytes)); size_t globalThreads[3] = {cols, rows, 1}; size_t localThreads[3] = {16, 16, 1}; - openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1); + openCLExecuteKernel(clCxt, &imgproc_canny, "edgesHysteresisLocal", globalThreads, localThreads, args, -1, -1); } - void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, void *counter, int rows, int cols) + void canny::edgesHysteresisGlobal_gpu(oclMat &map, oclMat &st1, oclMat &st2, oclMat& counter, int rows, int cols) { - unsigned int count; - 
openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getClCommandQueuePtr(), (cl_mem)counter, 1, 0, sizeof(float), &count, 0, NULL, NULL)); Context *clCxt = map.clCxt; - String kernelName = "edgesHysteresisGlobal"; - vector< pair > args; + std::vector< std::pair > args; size_t localThreads[3] = {128, 1, 1}; - int count_i[1] = {0}; - while(count > 0) + while(1 > 0) { - openCLSafeCall(clEnqueueWriteBuffer(*(cl_command_queue*)getClCommandQueuePtr(), (cl_mem)counter, 1, 0, sizeof(int), &count_i, 0, NULL, NULL)); + Mat counterMat; counter.download(counterMat); + int count = counterMat.at(0, 0); + CV_Assert(count >= 0); + if (count == 0) + break; + + counterMat.at(0, 0) = 0; + counter.upload(counterMat); args.clear(); - size_t globalThreads[3] = {std::min(count, 65535u) * 128, divUp(count, 65535), 1}; + size_t globalThreads[3] = {std::min((unsigned)count, 65535u) * 128, divUp(count, 65535), 1}; - args.push_back( make_pair( sizeof(cl_mem), (void *)&map.data)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&st1.data)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&st2.data)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&counter.data)); - args.push_back( make_pair( sizeof(cl_int), (void *)&rows)); - args.push_back( make_pair( sizeof(cl_int), (void *)&cols)); - args.push_back( make_pair( sizeof(cl_int), (void *)&count)); - args.push_back( make_pair( sizeof(cl_int), (void *)&map.step)); - args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset)); + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&map.data)); + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&st1.data)); + args.push_back( std::make_pair( sizeof(cl_mem), (void *)&st2.data)); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counter)); ++ args.push_back( std::make_pair( sizeof(cl_mem), (void *)&counter.data)); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&rows)); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols)); + 
args.push_back( std::make_pair( sizeof(cl_int), (void *)&count)); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.step)); + args.push_back( std::make_pair( sizeof(cl_int), (void *)&map.offset)); - openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1); - openCLSafeCall(clEnqueueReadBuffer(*(cl_command_queue*)getClCommandQueuePtr(), (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL)); + openCLExecuteKernel(clCxt, &imgproc_canny, "edgesHysteresisGlobal", globalThreads, localThreads, args, -1, -1); std::swap(st1, st2); } } diff --cc modules/ocl/src/cl_operations.cpp index f83220d,d344689..5910d05 --- a/modules/ocl/src/cl_operations.cpp +++ b/modules/ocl/src/cl_operations.cpp @@@ -174,10 -224,62 +224,62 @@@ void openCLCopyBuffer2D(Context *ctx, v void openCLFree(void *devPtr) { + #ifdef CHECK_MEMORY_CORRUPTION + bool failBefore = false, failAfter = false; + CheckBuffers data; + std::map::iterator i = __check_buffers.find((cl_mem)devPtr); + if (i != __check_buffers.end()) + { + data = i->second; + Context* ctx = Context::getContext(); + std::vector checkBefore(__memory_corruption_check_bytes); + std::vector checkAfter(__memory_corruption_check_bytes); + openCLVerifyCall(clEnqueueReadBuffer(getClCommandQueue(ctx), + data.mainBuffer, CL_TRUE, 0, __memory_corruption_check_bytes, &checkBefore[0], + 0, NULL, NULL)); + openCLVerifyCall(clEnqueueReadBuffer(getClCommandQueue(ctx), + data.mainBuffer, CL_TRUE, __memory_corruption_check_bytes + data.size, __memory_corruption_check_bytes, &checkAfter[0], + 0, NULL, NULL)); + + std::vector tmp(__memory_corruption_check_bytes / sizeof(int), + __memory_corruption_check_pattern); + + if (memcmp(&checkBefore[0], &tmp[0], __memory_corruption_check_bytes) != 0) + { + failBefore = true; + } + if (memcmp(&checkAfter[0], &tmp[0], __memory_corruption_check_bytes) != 0) + { + failAfter = true; + } + openCLSafeCall(clReleaseMemObject(data.mainBuffer)); + __check_buffers.erase(i); 
+ } + #endif openCLSafeCall(clReleaseMemObject((cl_mem)devPtr)); + #ifdef CHECK_MEMORY_CORRUPTION + if (failBefore) + { + #ifdef CHECK_MEMORY_CORRUPTION_PRINT_ERROR + std::cerr << "ERROR: Memory corruption detected: before buffer: " << cv::format("widthInBytes=%d height=%d", (int)data.widthInBytes, (int)data.height) << std::endl; + #endif + #ifdef CHECK_MEMORY_CORRUPTION_RAISE_ERROR + CV_Error(CV_StsInternal, "Memory corruption detected: before buffer"); + #endif + } + if (failAfter) + { + #ifdef CHECK_MEMORY_CORRUPTION_PRINT_ERROR + std::cerr << "ERROR: Memory corruption detected: after buffer: " << cv::format("widthInBytes=%d height=%d", (int)data.widthInBytes, (int)data.height) << std::endl; + #endif + #ifdef CHECK_MEMORY_CORRUPTION_RAISE_ERROR + CV_Error(CV_StsInternal, "Memory corruption detected: after buffer"); + #endif + } + #endif } -cl_kernel openCLGetKernelFromSource(const Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName) +cl_kernel openCLGetKernelFromSource(const Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName) { return openCLGetKernelFromSource(ctx, source, kernelName, NULL); } @@@ -234,8 -336,7 +336,7 @@@ static std::string removeDuplicatedWhit return opt; } - void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3], - size_t localThreads[3], std::vector< std::pair > &args, int channels, -cl_kernel openCLGetKernelFromSource(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName, int channels, ++cl_kernel openCLGetKernelFromSource(Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName, int channels, int depth, const char *build_options) { //construct kernel name @@@ -246,12 -347,16 +347,16 @@@ idxStr << "_C" << channels; if(depth != -1) idxStr << "_D" << depth; - kernelName += idxStr.str(); + kernelName = kernelName + idxStr.str(); - cl_kernel kernel; std::string fixedOptions = removeDuplicatedWhiteSpaces(build_options); - 
kernel = openCLGetKernelFromSource(ctx, source, kernelName, fixedOptions.c_str()); + cl_kernel kernel = openCLGetKernelFromSource(ctx, source, kernelName, fixedOptions.c_str()); + return kernel; + } + void openCLExecuteKernel(Context *ctx, cl_kernel kernel, size_t globalThreads[3], - size_t localThreads[3], vector< pair > &args) ++ size_t localThreads[3], std::vector< std::pair > &args) + { if ( localThreads != NULL) { globalThreads[0] = roundUp(globalThreads[0], localThreads[0]); @@@ -297,9 -402,18 +402,18 @@@ openCLSafeCall(clReleaseKernel(kernel)); } -void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName, size_t globalThreads[3], - size_t localThreads[3], vector< pair > &args, int channels, ++void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3], ++ size_t localThreads[3], std::vector< std::pair > &args, int channels, + int depth, const char *build_options) + { + cl_kernel kernel = openCLGetKernelFromSource(ctx, source, kernelName, channels, depth, build_options); + + openCLExecuteKernel(ctx, kernel, globalThreads, localThreads, args); + } + -void openCLExecuteKernel(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName, +void openCLExecuteKernel(Context *ctx, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3], size_t localThreads[3], - vector< pair > &args, int channels, int depth) + std::vector< std::pair > &args, int channels, int depth) { openCLExecuteKernel(ctx, source, kernelName, globalThreads, localThreads, args, channels, depth, NULL); diff --cc modules/ocl/src/filtering.cpp index 816988d,59146c1..305c723 --- a/modules/ocl/src/filtering.cpp +++ b/modules/ocl/src/filtering.cpp @@@ -452,7 -423,7 +424,8 @@@ void morphOp(int op, const oclMat &src else kernel = _kernel; - Ptr f = createMorphologyFilter_GPU(op, src.type(), kernel, anchor, iterations); - Ptr f = createMorphologyFilter_GPU(op, src.type(), kernel, 
anchor, iterations); ++ Ptr f = createMorphologyFilter_GPU(op, src.type(), kernel, anchor, iterations) ++ .staticCast(); f->apply(src, dst); } @@@ -550,99 -547,165 +549,165 @@@ static void GPUFilter2D(const oclMat &s CV_Assert(src.clCxt == dst.clCxt); CV_Assert((src.cols == dst.cols) && (src.rows == dst.rows)); - CV_Assert((src.oclchannels() == dst.oclchannels())); - CV_Assert(ksize.height > 0 && ksize.width > 0 && ((ksize.height & 1) == 1) && ((ksize.width & 1) == 1)); - CV_Assert((anchor.x == -1 && anchor.y == -1) || (anchor.x == ksize.width >> 1 && anchor.y == ksize.height >> 1)); - CV_Assert(ksize.width == ksize.height); - Context *clCxt = src.clCxt; - - int filterWidth = ksize.width; - bool ksize_3x3 = filterWidth == 3 && src.type() != CV_32FC4 && src.type() != CV_32FC3; // CV_32FC4 is not tuned up with filter2d_3x3 kernel + CV_Assert(src.oclchannels() == dst.oclchannels()); - String kernelName = ksize_3x3 ? "filter2D_3x3" : "filter2D"; + CV_Assert(kernel.cols == ksize.width && kernel.rows == ksize.height); + CV_Assert(kernel.channels() == 1); - size_t src_offset_x = (src.offset % src.step) / src.elemSize(); - size_t src_offset_y = src.offset / src.step; + CV_Assert(anchor.x >= 0 && anchor.x < kernel.cols); + CV_Assert(anchor.y >= 0 && anchor.y < kernel.rows); - size_t dst_offset_x = (dst.offset % dst.step) / dst.elemSize(); - size_t dst_offset_y = dst.offset / dst.step; + bool useDouble = src.depth() == CV_64F; - int paddingPixels = filterWidth & (-2); + std::vector kernelDataFloat; + std::vector kernelDataDouble; + int kernel_size_y2_aligned = useDouble ? 
+ _prepareKernelFilter2D(kernelDataDouble, kernel) + : _prepareKernelFilter2D(kernelDataFloat, kernel); + oclMat oclKernelParameter; + if (useDouble) + { + oclKernelParameter.createEx(1, kernelDataDouble.size(), CV_64FC1, DEVICE_MEM_R_ONLY, DEVICE_MEM_DEFAULT); + openCLMemcpy2D(src.clCxt, oclKernelParameter.data, kernelDataDouble.size()*sizeof(double), + &kernelDataDouble[0], kernelDataDouble.size()*sizeof(double), + kernelDataDouble.size()*sizeof(double), 1, clMemcpyHostToDevice); + } + else + { + oclKernelParameter.createEx(1, kernelDataFloat.size(), CV_32FC1, DEVICE_MEM_R_ONLY, DEVICE_MEM_DEFAULT); + openCLMemcpy2D(src.clCxt, oclKernelParameter.data, kernelDataFloat.size()*sizeof(float), + &kernelDataFloat[0], kernelDataFloat.size()*sizeof(float), + kernelDataFloat.size()*sizeof(float), 1, clMemcpyHostToDevice); + } - size_t localThreads[3] = {ksize_3x3 ? 256 : 16, ksize_3x3 ? 1 : 16, 1}; - size_t globalThreads[3] = {src.wholecols, src.wholerows, 1}; + size_t tryWorkItems = src.clCxt->getDeviceInfo().maxWorkItemSizes[0]; + do { + size_t BLOCK_SIZE = tryWorkItems; + while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)src.cols * 2) + BLOCK_SIZE /= 2; + #if 1 // TODO Mode with several blocks requires a much more VGPRs, so this optimization is not actual for the current devices + size_t BLOCK_SIZE_Y = 1; + #else + size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices + while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows) + BLOCK_SIZE_Y *= 2; + #endif + + CV_Assert((size_t)ksize.width <= BLOCK_SIZE); + + bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0; + - vector > args; ++ std::vector > args; + - args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); ++ args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data)); + cl_uint stepBytes = src.step; - args.push_back( make_pair( sizeof(cl_uint), (void *)&stepBytes)); ++ 
args.push_back( std::make_pair( sizeof(cl_uint), (void *)&stepBytes)); + int offsetXBytes = src.offset % src.step; + int offsetX = offsetXBytes / src.elemSize(); + CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes); + int offsetY = src.offset / src.step; + int endX = (offsetX + src.cols); + int endY = (offsetY + src.rows); + cl_int rect[4] = {offsetX, offsetY, endX, endY}; + if (!isIsolatedBorder) + { + rect[2] = src.wholecols; + rect[3] = src.wholerows; + } - args.push_back( make_pair( sizeof(cl_int)*4, (void *)&rect[0])); ++ args.push_back( std::make_pair( sizeof(cl_int)*4, (void *)&rect[0])); + - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); ++ args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data)); + cl_uint _stepBytes = dst.step; - args.push_back( make_pair( sizeof(cl_uint), (void *)&_stepBytes)); ++ args.push_back( std::make_pair( sizeof(cl_uint), (void *)&_stepBytes)); + int _offsetXBytes = dst.offset % dst.step; + int _offsetX = _offsetXBytes / dst.elemSize(); + CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes); + int _offsetY = dst.offset / dst.step; + int _endX = (_offsetX + dst.cols); + int _endY = (_offsetY + dst.rows); + cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY}; - args.push_back( make_pair( sizeof(cl_int)*4, (void *)&_rect[0])); ++ args.push_back( std::make_pair( sizeof(cl_int)*4, (void *)&_rect[0])); + + float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body + double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body + if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT) + { + if (useDouble) - args.push_back( make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0])); ++ args.push_back( std::make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0])); + else - args.push_back( make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0])); ++ args.push_back( std::make_pair( sizeof(float) * src.oclchannels(), (void 
*)&borderValueDouble[0])); + } - int cn = src.oclchannels(); - int src_step = (int)(src.step/src.elemSize()); - int dst_step = (int)(dst.step/src.elemSize()); - args.push_back( make_pair( sizeof(cl_mem), (void *)&oclKernelParameter.data)); ++ args.push_back( std::make_pair( sizeof(cl_mem), (void *)&oclKernelParameter.data)); - int localWidth = localThreads[0] + paddingPixels; - int localHeight = localThreads[1] + paddingPixels; + const char* btype = NULL; - size_t localMemSize = ksize_3x3 ? 260 * 6 * src.elemSize() : (localWidth * localHeight) * src.elemSize(); + switch (borderType & ~BORDER_ISOLATED) + { + case BORDER_CONSTANT: + btype = "BORDER_CONSTANT"; + break; + case BORDER_REPLICATE: + btype = "BORDER_REPLICATE"; + break; + case BORDER_REFLECT: + btype = "BORDER_REFLECT"; + break; + case BORDER_WRAP: + CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!"); + return; + case BORDER_REFLECT101: + btype = "BORDER_REFLECT_101"; + break; + } - int vector_lengths[4][7] = {{4, 4, 4, 4, 4, 4, 4}, - {4, 4, 1, 1, 1, 1, 1}, - {1, 1, 1, 1, 1, 1, 1}, - {4, 4, 4, 4, 1, 1, 4} - }; - int cols = dst.cols + ((dst_offset_x) & (vector_lengths[cn - 1][src.depth()] - 1)); + int requiredTop = anchor.y; + int requiredLeft = BLOCK_SIZE; // not this: anchor.x; + int requiredBottom = ksize.height - 1 - anchor.y; + int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x; + int h = isIsolatedBorder ? src.rows : src.wholerows; + int w = isIsolatedBorder ? src.cols : src.wholecols; + bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight; + + char build_options[1024]; + sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d " + "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D KERNEL_SIZE_Y2_ALIGNED=%d " + "-D %s -D %s -D %s", + (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y, + src.depth(), src.oclchannels(), useDouble ? 
1 : 0, + anchor.x, anchor.y, ksize.width, ksize.height, kernel_size_y2_aligned, + btype, + extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", + isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED"); + + size_t lt[3] = {BLOCK_SIZE, 1, 1}; + size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1}; + + cl_kernel kernel = openCLGetKernelFromSource(src.clCxt, &filtering_filter2D, "filter2D", -1, -1, build_options); + + size_t kernelWorkGroupSize; + openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(src.clCxt), + CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0)); + if (lt[0] > kernelWorkGroupSize) + { + clReleaseKernel(kernel); + CV_Assert(BLOCK_SIZE > kernelWorkGroupSize); + tryWorkItems = kernelWorkGroupSize; + continue; + } - std::vector< std::pair > args; - args.push_back(std::make_pair(sizeof(cl_mem), (void *)&src.data)); - args.push_back(std::make_pair(sizeof(cl_mem), (void *)&dst.data)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_step)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst_step)); - args.push_back(std::make_pair(sizeof(cl_mem), (void *)&mat_kernel.data)); - args.push_back(std::make_pair(localMemSize, (void *)NULL)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_offset_x)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_offset_y)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst_offset_x)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst_offset_y)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.cols)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.rows)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&cols)); - char btype[30]; - switch (borderType) - { - case 0: - 
sprintf(btype, "BORDER_CONSTANT"); - break; - case 1: - sprintf(btype, "BORDER_REPLICATE"); - break; - case 2: - sprintf(btype, "BORDER_REFLECT"); - break; - case 3: - CV_Error(Error::StsUnsupportedFormat, "BORDER_WRAP is not supported!"); - return; - case 4: - sprintf(btype, "BORDER_REFLECT_101"); - break; - } - int type = src.depth(); - char build_options[150]; - sprintf(build_options, "-D %s -D IMG_C_%d_%d -D CN=%d -D FILTER_SIZE=%d", btype, cn, type, cn, ksize.width); - openCLExecuteKernel(clCxt, &filtering_laplacian, kernelName, globalThreads, localThreads, args, -1, -1, build_options); + openCLExecuteKernel(src.clCxt, kernel, gt, lt, args); // kernel will be released here + } while (false); } - Ptr cv::ocl::getLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Size &ksize, + Ptr cv::ocl::getLinearFilter_GPU(int /*srcType*/, int /*dstType*/, const Mat &kernel, const Size &ksize, const Point &anchor, int borderType) { - static const GPUFilter2D_t GPUFilter2D_callers[] = {0, GPUFilter2D, 0, GPUFilter2D, GPUFilter2D}; - - CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC3 || srcType == CV_8UC4 || srcType == CV_32FC1 || srcType == CV_32FC3 || srcType == CV_32FC4) && dstType == srcType); - - oclMat gpu_krnl; Point norm_archor = anchor; - normalizeKernel(kernel, gpu_krnl, CV_32FC1); normalizeAnchor(norm_archor, ksize); - return makePtr(ksize, anchor, gpu_krnl, GPUFilter2D_callers[CV_MAT_CN(srcType)], - borderType); + return Ptr(new LinearFilter_GPU(ksize, norm_archor, kernel, GPUFilter2D, + borderType)); } Ptr cv::ocl::createLinearFilter_GPU(int srcType, int dstType, const Mat &kernel, const Point &anchor, @@@ -711,15 -776,10 +778,10 @@@ public Ptr cv::ocl::createSeparableFilter_GPU(const Ptr &rowFilter, const Ptr &columnFilter) { - return Ptr(new SeparableFilterEngine_GPU(rowFilter, columnFilter)); + return makePtr(rowFilter, columnFilter); } - /* - **data type supported: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4 - **support four border types: 
BORDER_CONSTANT, BORDER_REPLICATE, BORDER_REFLECT, BORDER_REFLECT_101 - */ - - static void GPUFilterBox_8u_C1R(const oclMat &src, oclMat &dst, + static void GPUFilterBox(const oclMat &src, oclMat &dst, Size &ksize, const Point anchor, const int borderType) { //Normalize the result by default @@@ -728,262 -788,137 +790,137 @@@ CV_Assert(src.clCxt == dst.clCxt); CV_Assert((src.cols == dst.cols) && (src.rows == dst.rows)); - Context *clCxt = src.clCxt; - - String kernelName = "boxFilter_C1_D0"; - - char btype[30]; - - switch (borderType) - { - case 0: - sprintf(btype, "BORDER_CONSTANT"); - break; - case 1: - sprintf(btype, "BORDER_REPLICATE"); - break; - case 2: - sprintf(btype, "BORDER_REFLECT"); - break; - case 3: - CV_Error(Error::StsUnsupportedFormat, "BORDER_WRAP is not supported!"); - return; - case 4: - sprintf(btype, "BORDER_REFLECT_101"); - break; - } - - char build_options[150]; - sprintf(build_options, "-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s", anchor.x, anchor.y, ksize.width, ksize.height, btype); - - size_t blockSizeX = 256, blockSizeY = 1; - size_t gSize = blockSizeX - (ksize.width - 1); - size_t threads = (dst.offset % dst.step % 4 + dst.cols + 3) / 4; - size_t globalSizeX = threads % gSize == 0 ? threads / gSize * blockSizeX : (threads / gSize + 1) * blockSizeX; - size_t globalSizeY = ((dst.rows + 1) / 2) % blockSizeY == 0 ? 
((dst.rows + 1) / 2) : (((dst.rows + 1) / 2) / blockSizeY + 1) * blockSizeY; - - size_t globalThreads[3] = { globalSizeX, globalSizeY, 1 }; - size_t localThreads[3] = { blockSizeX, blockSizeY, 1 }; - - std::vector > args; - args.push_back(std::make_pair(sizeof(cl_mem), &src.data)); - args.push_back(std::make_pair(sizeof(cl_mem), &dst.data)); - args.push_back(std::make_pair(sizeof(cl_float), (void *)&alpha)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.offset)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.step)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.offset)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.rows)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.cols)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.step)); - - openCLExecuteKernel(clCxt, &filtering_boxFilter, kernelName, globalThreads, localThreads, args, -1, -1, build_options); - } - - static void GPUFilterBox_8u_C4R(const oclMat &src, oclMat &dst, - Size &ksize, const Point anchor, const int borderType) - { - //Normalize the result by default - float alpha = ksize.height * ksize.width; - - CV_Assert(src.clCxt == dst.clCxt); - CV_Assert((src.cols == dst.cols) && - (src.rows == dst.rows)); - Context *clCxt = src.clCxt; - - String kernelName = "boxFilter_C4_D0"; - - char btype[30]; - - switch (borderType) - { - case 0: - sprintf(btype, "BORDER_CONSTANT"); - break; - case 1: - sprintf(btype, "BORDER_REPLICATE"); - break; - case 2: - sprintf(btype, "BORDER_REFLECT"); - break; - case 3: - CV_Error(Error::StsUnsupportedFormat, "BORDER_WRAP is not supported!"); - return; - case 4: - sprintf(btype, "BORDER_REFLECT_101"); - break; - } - - char build_options[150]; - sprintf(build_options, "-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s", anchor.x, anchor.y, 
ksize.width, ksize.height, btype); - - size_t blockSizeX = 256, blockSizeY = 1; - size_t gSize = blockSizeX - ksize.width / 2 * 2; - size_t globalSizeX = (src.cols) % gSize == 0 ? src.cols / gSize * blockSizeX : (src.cols / gSize + 1) * blockSizeX; - size_t rows_per_thread = 2; - size_t globalSizeY = ((src.rows + rows_per_thread - 1) / rows_per_thread) % blockSizeY == 0 ? ((src.rows + rows_per_thread - 1) / rows_per_thread) : (((src.rows + rows_per_thread - 1) / rows_per_thread) / blockSizeY + 1) * blockSizeY; - - size_t globalThreads[3] = { globalSizeX, globalSizeY, 1}; - size_t localThreads[3] = { blockSizeX, blockSizeY, 1}; - - std::vector > args; - args.push_back(std::make_pair(sizeof(cl_mem), &src.data)); - args.push_back(std::make_pair(sizeof(cl_mem), &dst.data)); - args.push_back(std::make_pair(sizeof(cl_float), (void *)&alpha)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.offset)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.step)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.offset)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.rows)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.cols)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.step)); - - openCLExecuteKernel(clCxt, &filtering_boxFilter, kernelName, globalThreads, localThreads, args, -1, -1, build_options); - } - - static void GPUFilterBox_32F_C1R(const oclMat &src, oclMat &dst, - Size &ksize, const Point anchor, const int borderType) - { - //Normalize the result by default - float alpha = ksize.height * ksize.width; - - CV_Assert(src.clCxt == dst.clCxt); - CV_Assert((src.cols == dst.cols) && - (src.rows == dst.rows)); - Context *clCxt = src.clCxt; - - String kernelName = "boxFilter_C1_D5"; - - char btype[30]; - - switch (borderType) - { - case 0: - 
sprintf(btype, "BORDER_CONSTANT"); - break; - case 1: - sprintf(btype, "BORDER_REPLICATE"); - break; - case 2: - sprintf(btype, "BORDER_REFLECT"); - break; - case 3: - CV_Error(Error::StsUnsupportedFormat, "BORDER_WRAP is not supported!"); - return; - case 4: - sprintf(btype, "BORDER_REFLECT_101"); - break; - } - - char build_options[150]; - sprintf(build_options, "-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s", anchor.x, anchor.y, ksize.width, ksize.height, btype); - - size_t blockSizeX = 256, blockSizeY = 1; - size_t gSize = blockSizeX - ksize.width / 2 * 2; - size_t globalSizeX = (src.cols) % gSize == 0 ? src.cols / gSize * blockSizeX : (src.cols / gSize + 1) * blockSizeX; - size_t rows_per_thread = 2; - size_t globalSizeY = ((src.rows + rows_per_thread - 1) / rows_per_thread) % blockSizeY == 0 ? ((src.rows + rows_per_thread - 1) / rows_per_thread) : (((src.rows + rows_per_thread - 1) / rows_per_thread) / blockSizeY + 1) * blockSizeY; - - - size_t globalThreads[3] = { globalSizeX, globalSizeY, 1}; - size_t localThreads[3] = { blockSizeX, blockSizeY, 1}; - - std::vector > args; - args.push_back(std::make_pair(sizeof(cl_mem), &src.data)); - args.push_back(std::make_pair(sizeof(cl_mem), &dst.data)); - args.push_back(std::make_pair(sizeof(cl_float), (void *)&alpha)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.offset)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.step)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.offset)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.rows)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.cols)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.step)); - - openCLExecuteKernel(clCxt, &filtering_boxFilter, kernelName, globalThreads, localThreads, args, -1, -1, build_options); - } - - static void 
GPUFilterBox_32F_C4R(const oclMat &src, oclMat &dst, - Size &ksize, const Point anchor, const int borderType) - { - //Normalize the result by default - float alpha = ksize.height * ksize.width; - - CV_Assert(src.clCxt == dst.clCxt); - CV_Assert((src.cols == dst.cols) && - (src.rows == dst.rows)); - Context *clCxt = src.clCxt; - - String kernelName = "boxFilter_C4_D5"; - - char btype[30]; - - switch (borderType) - { - case 0: - sprintf(btype, "BORDER_CONSTANT"); - break; - case 1: - sprintf(btype, "BORDER_REPLICATE"); - break; - case 2: - sprintf(btype, "BORDER_REFLECT"); - break; - case 3: - CV_Error(Error::StsUnsupportedFormat, "BORDER_WRAP is not supported!"); - return; - case 4: - sprintf(btype, "BORDER_REFLECT_101"); - break; - } + CV_Assert(src.oclchannels() == dst.oclchannels()); - char build_options[150]; - sprintf(build_options, "-D anX=%d -D anY=%d -D ksX=%d -D ksY=%d -D %s", anchor.x, anchor.y, ksize.width, ksize.height, btype); + size_t tryWorkItems = src.clCxt->getDeviceInfo().maxWorkItemSizes[0]; + do { + size_t BLOCK_SIZE = tryWorkItems; + while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)src.cols * 2) + BLOCK_SIZE /= 2; + size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices + while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows) + BLOCK_SIZE_Y *= 2; + + CV_Assert((size_t)ksize.width <= BLOCK_SIZE); + + bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0; + - vector > args; ++ std::vector > args; + - args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); ++ args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data)); + cl_uint stepBytes = src.step; - args.push_back( make_pair( sizeof(cl_uint), (void *)&stepBytes)); ++ args.push_back( std::make_pair( sizeof(cl_uint), (void *)&stepBytes)); + int offsetXBytes = src.offset % src.step; + int offsetX = offsetXBytes / src.elemSize(); + CV_Assert((int)(offsetX * 
src.elemSize()) == offsetXBytes); + int offsetY = src.offset / src.step; + int endX = (offsetX + src.cols); + int endY = (offsetY + src.rows); + cl_int rect[4] = {offsetX, offsetY, endX, endY}; + if (!isIsolatedBorder) + { + rect[2] = src.wholecols; + rect[3] = src.wholerows; + } - args.push_back( make_pair( sizeof(cl_int)*4, (void *)&rect[0])); ++ args.push_back( std::make_pair( sizeof(cl_int)*4, (void *)&rect[0])); + - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); ++ args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst.data)); + cl_uint _stepBytes = dst.step; - args.push_back( make_pair( sizeof(cl_uint), (void *)&_stepBytes)); ++ args.push_back( std::make_pair( sizeof(cl_uint), (void *)&_stepBytes)); + int _offsetXBytes = dst.offset % dst.step; + int _offsetX = _offsetXBytes / dst.elemSize(); + CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes); + int _offsetY = dst.offset / dst.step; + int _endX = (_offsetX + dst.cols); + int _endY = (_offsetY + dst.rows); + cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY}; - args.push_back( make_pair( sizeof(cl_int)*4, (void *)&_rect[0])); ++ args.push_back( std::make_pair( sizeof(cl_int)*4, (void *)&_rect[0])); + + bool useDouble = src.depth() == CV_64F; + + float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body + double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body + if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT) + { + if (useDouble) - args.push_back( make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0])); ++ args.push_back( std::make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0])); + else - args.push_back( make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0])); ++ args.push_back( std::make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0])); + } - size_t blockSizeX = 256, blockSizeY = 1; - size_t gSize = blockSizeX - ksize.width / 2 * 2; - size_t globalSizeX = 
(src.cols) % gSize == 0 ? src.cols / gSize * blockSizeX : (src.cols / gSize + 1) * blockSizeX; - size_t rows_per_thread = 2; - size_t globalSizeY = ((src.rows + rows_per_thread - 1) / rows_per_thread) % blockSizeY == 0 ? ((src.rows + rows_per_thread - 1) / rows_per_thread) : (((src.rows + rows_per_thread - 1) / rows_per_thread) / blockSizeY + 1) * blockSizeY; + double alphaDouble = alpha; // DON'T move into 'if' body + if (useDouble) - args.push_back( make_pair( sizeof(double), (void *)&alphaDouble)); ++ args.push_back( std::make_pair( sizeof(double), (void *)&alphaDouble)); + else - args.push_back( make_pair( sizeof(float), (void *)&alpha)); ++ args.push_back( std::make_pair( sizeof(float), (void *)&alpha)); + const char* btype = NULL; - size_t globalThreads[3] = { globalSizeX, globalSizeY, 1}; - size_t localThreads[3] = { blockSizeX, blockSizeY, 1}; + switch (borderType & ~BORDER_ISOLATED) + { + case BORDER_CONSTANT: + btype = "BORDER_CONSTANT"; + break; + case BORDER_REPLICATE: + btype = "BORDER_REPLICATE"; + break; + case BORDER_REFLECT: + btype = "BORDER_REFLECT"; + break; + case BORDER_WRAP: + CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!"); + return; + case BORDER_REFLECT101: + btype = "BORDER_REFLECT_101"; + break; + } - std::vector > args; - args.push_back(std::make_pair(sizeof(cl_mem), &src.data)); - args.push_back(std::make_pair(sizeof(cl_mem), &dst.data)); - args.push_back(std::make_pair(sizeof(cl_float), (void *)&alpha)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.offset)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholerows)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.wholecols)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.step)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.offset)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.rows)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.cols)); - 
args.push_back(std::make_pair(sizeof(cl_int), (void *)&dst.step)); + int requiredTop = anchor.y; + int requiredLeft = BLOCK_SIZE; // not this: anchor.x; + int requiredBottom = ksize.height - 1 - anchor.y; + int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x; + int h = isIsolatedBorder ? src.rows : src.wholerows; + int w = isIsolatedBorder ? src.cols : src.wholecols; + bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight; + + CV_Assert(w >= ksize.width && h >= ksize.height); // TODO Other cases are not tested well + + char build_options[1024]; + sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s -D %s -D %s", + (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y, + src.depth(), src.oclchannels(), useDouble ? 1 : 0, + anchor.x, anchor.y, ksize.width, ksize.height, + btype, + extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", + isIsolatedBorder ? 
"BORDER_ISOLATED" : "NO_BORDER_ISOLATED"); + + size_t lt[3] = {BLOCK_SIZE, 1, 1}; + size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1}; + + cl_kernel kernel = openCLGetKernelFromSource(src.clCxt, &filtering_boxFilter, "boxFilter", -1, -1, build_options); + + size_t kernelWorkGroupSize; + openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(src.clCxt), + CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0)); + if (lt[0] > kernelWorkGroupSize) + { + clReleaseKernel(kernel); + CV_Assert(BLOCK_SIZE > kernelWorkGroupSize); + tryWorkItems = kernelWorkGroupSize; + continue; + } - openCLExecuteKernel(clCxt, &filtering_boxFilter, kernelName, globalThreads, localThreads, args, -1, -1, build_options); + openCLExecuteKernel(src.clCxt, kernel, gt, lt, args); // kernel will be released here + } while (false); } - - Ptr cv::ocl::getBoxFilter_GPU(int srcType, int dstType, + Ptr cv::ocl::getBoxFilter_GPU(int /*srcType*/, int /*dstType*/, const Size &ksize, Point anchor, int borderType) { - static const FilterBox_t FilterBox_callers[2][5] = {{0, GPUFilterBox_8u_C1R, 0, GPUFilterBox_8u_C4R, GPUFilterBox_8u_C4R}, - {0, GPUFilterBox_32F_C1R, 0, GPUFilterBox_32F_C4R, GPUFilterBox_32F_C4R} - }; - //Remove this check if more data types need to be supported. 
- CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC3 || srcType == CV_8UC4 || srcType == CV_32FC1 || - srcType == CV_32FC3 || srcType == CV_32FC4) && dstType == srcType); - normalizeAnchor(anchor, ksize); - return makePtr(ksize, anchor, - borderType, FilterBox_callers[(CV_MAT_DEPTH(srcType) == CV_32F)][CV_MAT_CN(srcType)]); + return Ptr(new GPUBoxFilter(ksize, anchor, + borderType, GPUFilterBox)); } Ptr cv::ocl::createBoxFilter_GPU(int srcType, int dstType, @@@ -1373,11 -1308,14 +1310,14 @@@ void cv::ocl::Scharr(const oclMat &src sepFilter2D(src, dst, ddepth, kx, ky, Point(-1, -1), delta, bordertype); } - void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, double scale) + void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, double scale, + double delta, int borderType) { + CV_Assert(delta == 0); + if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.type() == CV_64F) { - CV_Error(CV_OpenCLDoubleNotSupported, "Selected device doesn't support double"); + CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double"); return; } diff --cc modules/ocl/src/haar.cpp index 8116496,31f6742..fd67daf --- a/modules/ocl/src/haar.cpp +++ b/modules/ocl/src/haar.cpp @@@ -831,34 -831,156 +831,156 @@@ void OclCascadeClassifier::detectMultiS pq.s[3] = gcascade->pq3; float correction = gcascade->inv_window_area; - vector > args; - args.push_back ( make_pair(sizeof(cl_mem) , (void *)&stagebuffer )); - args.push_back ( make_pair(sizeof(cl_mem) , (void *)&scaleinfobuffer )); - args.push_back ( make_pair(sizeof(cl_mem) , (void *)&nodebuffer )); - args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsum.data )); - args.push_back ( make_pair(sizeof(cl_mem) , (void *)&gsqsum.data )); - args.push_back ( make_pair(sizeof(cl_mem) , (void *)&candidatebuffer )); - args.push_back ( make_pair(sizeof(cl_int) , (void *)&pixelstep )); - args.push_back ( make_pair(sizeof(cl_int) , (void *)&loopcount )); - args.push_back ( 
make_pair(sizeof(cl_int) , (void *)&startstage )); - args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitstage )); - args.push_back ( make_pair(sizeof(cl_int) , (void *)&endstage )); - args.push_back ( make_pair(sizeof(cl_int) , (void *)&startnode )); - args.push_back ( make_pair(sizeof(cl_int) , (void *)&splitnode )); - args.push_back ( make_pair(sizeof(cl_int4) , (void *)&p )); - args.push_back ( make_pair(sizeof(cl_int4) , (void *)&pq )); - args.push_back ( make_pair(sizeof(cl_float) , (void *)&correction )); + std::vector > args; + args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&stagebuffer )); + args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&scaleinfobuffer )); + args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&nodebuffer )); + args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&gsum.data )); + args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&gsqsum.data )); + args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&candidatebuffer )); + args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&pixelstep )); + args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&loopcount )); + args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startstage )); + args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&splitstage )); + args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&endstage )); + args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&startnode )); + args.push_back ( std::make_pair(sizeof(cl_int) , (void *)&splitnode )); + args.push_back ( std::make_pair(sizeof(cl_int4) , (void *)&p )); + args.push_back ( std::make_pair(sizeof(cl_int4) , (void *)&pq )); + args.push_back ( std::make_pair(sizeof(cl_float) , (void *)&correction )); - const char * build_options = gcascade->is_stump_based ? 
"-D STUMP_BASED=1" : "-D STUMP_BASED=0"; + if(gcascade->is_stump_based && gsum.clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE)) + { + //setup local group size + localThreads[0] = 8; + localThreads[1] = 16; + localThreads[2] = 1; + + //init maximal number of workgroups + int WGNumX = 1+(sizev[0].width /(localThreads[0])); + int WGNumY = 1+(sizev[0].height/(localThreads[1])); + int WGNumZ = loopcount; + int WGNum = 0; //accurate number of non -empty workgroups + oclMat oclWGInfo(1,sizeof(cl_int4) * WGNumX*WGNumY*WGNumZ,CV_8U); + { + cl_int4* pWGInfo = (cl_int4*)clEnqueueMapBuffer(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,true,CL_MAP_WRITE, 0, oclWGInfo.step, 0,0,0,&status); + openCLVerifyCall(status); + for(int z=0;z> 16)&0xFFFF; + int Height = (scaleinfo[z].width_height >> 0 )& 0xFFFF; + for(int y=0;y=(Height-cascade->orig_window_size.height)) + continue; // no data to process + for(int x=0;x=(Width-cascade->orig_window_size.width)) + continue; // no data to process + + // save no-empty workgroup info into array + pWGInfo[WGNum].s[0] = scaleinfo[z].width_height; + pWGInfo[WGNum].s[1] = (gx << 16) | gy; + pWGInfo[WGNum].s[2] = scaleinfo[z].imgoff; + memcpy(&(pWGInfo[WGNum].s[3]),&(scaleinfo[z].factor),sizeof(float)); + WGNum++; + } + } + } + openCLSafeCall(clEnqueueUnmapMemObject(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,pWGInfo,0,0,0)); + pWGInfo = NULL; + } - openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1, build_options); + // setup global sizes to have linear array of workgroups with WGNum size + globalThreads[0] = localThreads[0]*WGNum; + globalThreads[1] = localThreads[1]; + globalThreads[2] = 1; - openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz ); + #define NODE_SIZE 12 + // pack node info to have less memory loads + oclMat oclNodesPK(1,sizeof(cl_int) * NODE_SIZE * nodenum,CV_8U); + { + cl_int status; + 
cl_int* pNodesPK = (cl_int*)clEnqueueMapBuffer(getClCommandQueue(oclNodesPK.clCxt),(cl_mem)oclNodesPK.datastart,true,CL_MAP_WRITE, 0, oclNodesPK.step, 0,0,0,&status); + openCLVerifyCall(status); + //use known local data stride to precalulate indexes + int DATA_SIZE_X = (localThreads[0]+cascade->orig_window_size.width); + // check that maximal value is less than maximal unsigned short + assert(DATA_SIZE_X*cascade->orig_window_size.height+cascade->orig_window_size.width < USHRT_MAX); + for(int i = 0;islm_index[k][0] = (unsigned short)(p[1]*DATA_SIZE_X+p[0]); + pOut->slm_index[k][1] = (unsigned short)(p[1]*DATA_SIZE_X+p[2]); + pOut->slm_index[k][2] = (unsigned short)(p[3]*DATA_SIZE_X+p[0]); + pOut->slm_index[k][3] = (unsigned short)(p[3]*DATA_SIZE_X+p[2]); + } + //store used float point values for each node + pOut->weight[0] = node[i].weight[0]; + pOut->weight[1] = node[i].weight[1]; + pOut->weight[2] = node[i].weight[2]; + pOut->threshold = node[i].threshold; + pOut->alpha[0] = node[i].alpha[0]; - pOut->alpha[1] = node[i].alpha[1]; ++ pOut->alpha[1] = node[i].alpha[1]; + } + openCLSafeCall(clEnqueueUnmapMemObject(getClCommandQueue(oclNodesPK.clCxt),(cl_mem)oclNodesPK.datastart,pNodesPK,0,0,0)); + pNodesPK = NULL; + } + // add 2 additional buffers (WGinfo and packed nodes) as 2 last args - args.push_back ( make_pair(sizeof(cl_mem) , (void *)&oclNodesPK.datastart )); - args.push_back ( make_pair(sizeof(cl_mem) , (void *)&oclWGInfo.datastart )); ++ args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&oclNodesPK.datastart )); ++ args.push_back ( std::make_pair(sizeof(cl_mem) , (void *)&oclWGInfo.datastart )); + + //form build options for kernel - string options = "-D PACKED_CLASSIFIER"; - options += format(" -D NODE_SIZE=%d",NODE_SIZE); - options += format(" -D WND_SIZE_X=%d",cascade->orig_window_size.width); - options += format(" -D WND_SIZE_Y=%d",cascade->orig_window_size.height); - options += format(" -D STUMP_BASED=%d",gcascade->is_stump_based); - options += 
format(" -D LSx=%d",localThreads[0]); - options += format(" -D LSy=%d",localThreads[1]); - options += format(" -D SPLITNODE=%d",splitnode); - options += format(" -D SPLITSTAGE=%d",splitstage); - options += format(" -D OUTPUTSZ=%d",outputsz); ++ String options = "-D PACKED_CLASSIFIER"; ++ options = options + format(" -D NODE_SIZE=%d",NODE_SIZE); ++ options = options + format(" -D WND_SIZE_X=%d",cascade->orig_window_size.width); ++ options = options + format(" -D WND_SIZE_Y=%d",cascade->orig_window_size.height); ++ options = options + format(" -D STUMP_BASED=%d",gcascade->is_stump_based); ++ options = options + format(" -D LSx=%d",localThreads[0]); ++ options = options + format(" -D LSy=%d",localThreads[1]); ++ options = options + format(" -D SPLITNODE=%d",splitnode); ++ options = options + format(" -D SPLITSTAGE=%d",splitstage); ++ options = options + format(" -D OUTPUTSZ=%d",outputsz); + + // init candiate global count by 0 + int pattern = 0; + openCLSafeCall(clEnqueueWriteBuffer(qu, candidatebuffer, 1, 0, 1 * sizeof(pattern),&pattern, 0, NULL, NULL)); + // execute face detector + openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, localThreads, args, -1, -1, options.c_str()); + //read candidate buffer back and put it into host list + openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz ); + assert(candidate[0]is_stump_based ? 
"-D STUMP_BASED=1" : "-D STUMP_BASED=0"; + + openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1, build_options); - for(int i = 0; i < outputsz; i++) - if(candidate[4 * i + 2] != 0) - allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1], - candidate[4 * i + 2], candidate[4 * i + 3])); + openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz ); + + for(int i = 0; i < outputsz; i++) + if(candidate[4 * i + 2] != 0) + allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1], + candidate[4 * i + 2], candidate[4 * i + 3])); + } free(scaleinfo); free(candidate); diff --cc modules/ocl/src/imgproc.cpp index ed39868,3539dfa..96bdb91 --- a/modules/ocl/src/imgproc.cpp +++ b/modules/ocl/src/imgproc.cpp @@@ -99,79 -98,85 +99,85 @@@ namespace c ///////////////////////////////////////////////////////////////////////////////////// // threshold - typedef void (*gpuThresh_t)(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type); - - static void threshold_8u(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type) + static std::vector scalarToVector(const cv::Scalar & sc, int depth, int ocn, int cn) { - uchar thresh_uchar = cvFloor(thresh); - uchar max_val = cvRound(maxVal); + CV_Assert(ocn == cn || (ocn == 4 && cn == 3)); - size_t cols = (dst.cols + (dst.offset % 16) + 15) / 16; - size_t bSizeX = 16, bSizeY = 16; - size_t gSizeX = cols % bSizeX == 0 ? 
cols : (cols + bSizeX - 1) / bSizeX * bSizeX; - size_t gSizeY = dst.rows; - size_t globalThreads[3] = {gSizeX, gSizeY, 1}; - size_t localThreads[3] = {bSizeX, bSizeY, 1}; + static const int sizeMap[] = { sizeof(uchar), sizeof(char), sizeof(ushort), + sizeof(short), sizeof(int), sizeof(float), sizeof(double) }; - std::vector< std::pair > args; - args.push_back( std::make_pair(sizeof(cl_mem), &src.data)); - args.push_back( std::make_pair(sizeof(cl_mem), &dst.data)); - args.push_back( std::make_pair(sizeof(cl_int), (void *)&src.offset)); - args.push_back( std::make_pair(sizeof(cl_int), (void *)&src.step)); - args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.offset)); - args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.rows)); - args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.cols)); - args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.step)); - args.push_back( std::make_pair(sizeof(cl_uchar), (void *)&thresh_uchar)); - args.push_back( std::make_pair(sizeof(cl_uchar), (void *)&max_val)); - args.push_back( std::make_pair(sizeof(cl_int), (void *)&type)); - openCLExecuteKernel(src.clCxt, &imgproc_threshold, "threshold", globalThreads, localThreads, args, src.oclchannels(), src.depth()); + int elemSize1 = sizeMap[depth]; + int bufSize = elemSize1 * ocn; + std::vector _buf(bufSize); + uchar * buf = &_buf[0]; + scalarToRawData(sc, buf, CV_MAKE_TYPE(depth, cn)); + memset(buf + elemSize1 * cn, 0, (ocn - cn) * elemSize1); + + return _buf; } - static void threshold_32f(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type) + static void threshold_runner(const oclMat &src, oclMat &dst, double thresh, double maxVal, int thresholdType) { - float thresh_f = thresh; - float max_val = maxVal; - int dst_offset = (dst.offset >> 2); - int dst_step = (dst.step >> 2); - int src_offset = (src.offset >> 2); - int src_step = (src.step >> 2); - - size_t cols = (dst.cols + (dst_offset & 3) + 3) / 4; - size_t bSizeX = 16, bSizeY = 16; - 
size_t gSizeX = cols % bSizeX == 0 ? cols : (cols + bSizeX - 1) / bSizeX * bSizeX; - size_t gSizeY = dst.rows; - size_t globalThreads[3] = {gSizeX, gSizeY, 1}; - size_t localThreads[3] = {bSizeX, bSizeY, 1}; + bool ival = src.depth() < CV_32F; + int cn = src.channels(), vecSize = 4, depth = src.depth(); + std::vector thresholdValue = scalarToVector(cv::Scalar::all(ival ? cvFloor(thresh) : thresh), dst.depth(), + dst.oclchannels(), dst.channels()); + std::vector maxValue = scalarToVector(cv::Scalar::all(maxVal), dst.depth(), dst.oclchannels(), dst.channels()); + + const char * const thresholdMap[] = { "THRESH_BINARY", "THRESH_BINARY_INV", "THRESH_TRUNC", + "THRESH_TOZERO", "THRESH_TOZERO_INV" }; + const char * const channelMap[] = { "", "", "2", "4", "4" }; + const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" }; + std::string buildOptions = format("-D T=%s%s -D %s", typeMap[depth], channelMap[cn], thresholdMap[thresholdType]); + + int elemSize = src.elemSize(); + int src_step = src.step / elemSize, src_offset = src.offset / elemSize; + int dst_step = dst.step / elemSize, dst_offset = dst.offset / elemSize; - vector< pair > args; - args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data)); - args.push_back( make_pair(sizeof(cl_int), (void *)&src_offset)); - args.push_back( make_pair(sizeof(cl_int), (void *)&src_step)); - args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dst_offset)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dst_step)); - args.push_back( make_pair(thresholdValue.size(), (void *)&thresholdValue[0])); - args.push_back( make_pair(maxValue.size(), (void *)&maxValue[0])); + std::vector< std::pair > args; - args.push_back( std::make_pair(sizeof(cl_mem), &src.data)); - args.push_back( std::make_pair(sizeof(cl_mem), &dst.data)); ++ args.push_back( std::make_pair(sizeof(cl_mem), (void *)&src.data)); + args.push_back( 
std::make_pair(sizeof(cl_int), (void *)&src_offset)); + args.push_back( std::make_pair(sizeof(cl_int), (void *)&src_step)); ++ args.push_back( std::make_pair(sizeof(cl_mem), (void *)&dst.data)); + args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst_offset)); - args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.rows)); - args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.cols)); + args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst_step)); - args.push_back( std::make_pair(sizeof(cl_float), (void *)&thresh_f)); - args.push_back( std::make_pair(sizeof(cl_float), (void *)&max_val)); - args.push_back( std::make_pair(sizeof(cl_int), (void *)&type)); ++ args.push_back( std::make_pair(thresholdValue.size(), (void *)&thresholdValue[0])); ++ args.push_back( std::make_pair(maxValue.size(), (void *)&maxValue[0])); + + int max_index = dst.cols, cols = dst.cols; + if (cn == 1 && vecSize > 1) + { + CV_Assert(((vecSize - 1) & vecSize) == 0 && vecSize <= 16); + cols = divUp(cols, vecSize); + buildOptions += format(" -D VECTORIZED -D VT=%s%d -D VLOADN=vload%d -D VECSIZE=%d -D VSTOREN=vstore%d", + typeMap[depth], vecSize, vecSize, vecSize, vecSize); + + int vecSizeBytes = vecSize * dst.elemSize1(); + if ((dst.offset % dst.step) % vecSizeBytes == 0 && dst.step % vecSizeBytes == 0) + buildOptions += " -D DST_ALIGNED"; + if ((src.offset % src.step) % vecSizeBytes == 0 && src.step % vecSizeBytes == 0) + buildOptions += " -D SRC_ALIGNED"; + - args.push_back( make_pair(sizeof(cl_int), (void *)&max_index)); ++ args.push_back( std::make_pair(sizeof(cl_int), (void *)&max_index)); + } - openCLExecuteKernel(src.clCxt, &imgproc_threshold, "threshold", globalThreads, localThreads, args, src.oclchannels(), src.depth()); - args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows)); - args.push_back( make_pair(sizeof(cl_int), (void *)&cols)); ++ args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.rows)); ++ args.push_back( std::make_pair(sizeof(cl_int), 
(void *)&cols)); + + size_t localThreads[3] = { 16, 16, 1 }; + size_t globalThreads[3] = { cols, dst.rows, 1 }; + + openCLExecuteKernel(src.clCxt, &imgproc_threshold, "threshold", globalThreads, localThreads, args, + -1, -1, buildOptions.c_str()); } - // threshold: support 8UC1 and 32FC1 data type and five threshold type - double threshold(const oclMat &src, oclMat &dst, double thresh, double maxVal, int type) + double threshold(const oclMat &src, oclMat &dst, double thresh, double maxVal, int thresholdType) { - //TODO: These limitations shall be removed later. - CV_Assert(src.type() == CV_8UC1 || src.type() == CV_32FC1); - CV_Assert(type == THRESH_BINARY || type == THRESH_BINARY_INV || type == THRESH_TRUNC - || type == THRESH_TOZERO || type == THRESH_TOZERO_INV ); + CV_Assert(thresholdType == THRESH_BINARY || thresholdType == THRESH_BINARY_INV || thresholdType == THRESH_TRUNC + || thresholdType == THRESH_TOZERO || thresholdType == THRESH_TOZERO_INV); - static const gpuThresh_t gpuThresh_callers[2] = {threshold_8u, threshold_32f}; - - dst.create( src.size(), src.type() ); - gpuThresh_callers[(src.type() == CV_32FC1)](src, dst, thresh, maxVal, type); + dst.create(src.size(), src.type()); + threshold_runner(src, dst, thresh, maxVal, thresholdType); return thresh; } @@@ -891,8 -895,60 +897,60 @@@ if (ksize > 0) { - Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType); - Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType); + Context* clCxt = Context::getContext(); + if(clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE) && src.type() == CV_8UC1 && + src.cols % 8 == 0 && src.rows % 8 == 0 && + ksize==3 && + (borderType ==cv::BORDER_REFLECT || + borderType == cv::BORDER_REPLICATE || + borderType ==cv::BORDER_REFLECT101 || + borderType ==cv::BORDER_WRAP)) + { + Dx.create(src.size(), CV_32FC1); + Dy.create(src.size(), CV_32FC1); + + const unsigned int block_x = 8; + const unsigned int block_y = 8; + + unsigned int src_pitch = src.step; + unsigned int dst_pitch = 
Dx.cols; + + float _scale = scale; + + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data )); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dx.data )); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dy.data )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows )); + args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch )); + args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&dst_pitch )); + args.push_back( std::make_pair( sizeof(cl_float) , (void *)&_scale )); + size_t gt2[3] = {src.cols, src.rows, 1}, lt2[3] = {block_x, block_y, 1}; + - string option = "-D BLK_X=8 -D BLK_Y=8"; ++ String option = "-D BLK_X=8 -D BLK_Y=8"; + switch(borderType) + { + case cv::BORDER_REPLICATE: - option += " -D BORDER_REPLICATE"; ++ option = option + " -D BORDER_REPLICATE"; + break; + case cv::BORDER_REFLECT: - option += " -D BORDER_REFLECT"; ++ option = option + " -D BORDER_REFLECT"; + break; + case cv::BORDER_REFLECT101: - option += " -D BORDER_REFLECT101"; ++ option = option + " -D BORDER_REFLECT101"; + break; + case cv::BORDER_WRAP: - option += " -D BORDER_WRAP"; ++ option = option + " -D BORDER_WRAP"; + break; + } + openCLExecuteKernel(src.clCxt, &imgproc_sobel3, "sobel3", gt2, lt2, args, -1, -1, option.c_str() ); + } + else + { + Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType); + Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType); + } } else { @@@ -937,23 -993,24 +995,24 @@@ size_t gt[3] = { globalSizeX, globalSizeY, 1 }; size_t lt[3] = { blockSizeX, blockSizeY, 1 }; - vector > args; - args.push_back( make_pair( sizeof(cl_mem) , (void *)&Dx.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&Dy.data)); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data)); - args.push_back( make_pair( sizeof(cl_int) , (void *)&Dx.offset )); - args.push_back( make_pair( 
sizeof(cl_int) , (void *)&Dx.wholerows )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&Dx.wholecols )); - args.push_back( make_pair(sizeof(cl_int), (void *)&Dx.step)); - args.push_back( make_pair( sizeof(cl_int) , (void *)&Dy.offset )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&Dy.wholerows )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&Dy.wholecols )); - args.push_back( make_pair(sizeof(cl_int), (void *)&Dy.step)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols)); - args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step)); - args.push_back( make_pair( sizeof(cl_float) , (void *)&k)); + std::vector > args; + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dx.data )); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dy.data)); + args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dx.offset )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dx.wholerows )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dx.wholecols )); + args.push_back( std::make_pair(sizeof(cl_int), (void *)&Dx.step)); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dy.offset )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dy.wholerows )); + args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dy.wholecols )); + args.push_back( std::make_pair(sizeof(cl_int), (void *)&Dy.step)); + args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.offset)); + args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.rows)); + args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.cols)); + args.push_back( std::make_pair(sizeof(cl_int), (void *)&dst.step)); + args.push_back( std::make_pair( sizeof(cl_float) , (void *)&k)); + openCLExecuteKernel(dst.clCxt, 
source, kernelName, gt, lt, args, -1, -1, buildOptions.c_str()); } @@@ -969,7 -1026,7 +1028,7 @@@ { if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F) { - CV_Error(Error::OpenCLDoubleNotSupported, "Select device doesn't support double"); - CV_Error(CV_OpenCLDoubleNotSupported, "Selected device doesn't support double"); ++ CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double"); return; } @@@ -991,7 -1048,7 +1050,7 @@@ { if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F) { - CV_Error(Error::OpenCLDoubleNotSupported, "select device don't support double"); - CV_Error(CV_OpenCLDoubleNotSupported, "Selected device doesn't support double"); ++ CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double"); return; } diff --cc modules/ocl/src/kmeans.cpp index 5486aa4,58a68a7..52fe0eb --- a/modules/ocl/src/kmeans.cpp +++ b/modules/ocl/src/kmeans.cpp @@@ -160,32 -160,61 +160,61 @@@ static void generateCentersPP(const Mat } } - void cv::ocl::distanceToCenters(oclMat &dists, oclMat &labels, const oclMat &src, const oclMat &centers) + void cv::ocl::distanceToCenters(oclMat &dists, oclMat &labels, const oclMat &src, const oclMat &centers, int distType, const oclMat &indices) { - //if(src.clCxt -> impl -> double_support == 0 && src.type() == CV_64F) - //{ - // CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double"); - // return; - //} - - Context *clCxt = src.clCxt; - int labels_step = (int)(labels.step/labels.elemSize()); + CV_Assert(src.cols*src.oclchannels() == centers.cols*centers.oclchannels()); + CV_Assert(src.depth() == CV_32F && centers.depth() == CV_32F); + bool is_label_row_major = false; + ensureSizeIsEnough(1, src.rows, CV_32FC1, dists); + if(labels.empty() || (!labels.empty() && labels.rows == src.rows && labels.cols == 1)) + { + ensureSizeIsEnough(src.rows, 1, CV_32SC1, labels); + is_label_row_major = true; + } + CV_Assert(distType == NORM_L1 || 
distType == NORM_L2SQR); + + std::stringstream build_opt_ss; + build_opt_ss + << (distType == NORM_L1 ? "-D L1_DIST" : "-D L2SQR_DIST") + << (indices.empty() ? "" : " -D USE_INDEX"); + + String build_opt = build_opt_ss.str(); + + const int src_step = (int)(src.oclchannels() * src.step / src.elemSize()); + const int centers_step = (int)(centers.oclchannels() * centers.step / centers.elemSize()); + + const int colsNumb = centers.cols*centers.oclchannels(); + + const int label_step = is_label_row_major ? (int)(labels.step / labels.elemSize()) : 1; String kernelname = "distanceToCenters"; - int threadNum = src.rows > 256 ? 256 : src.rows; - size_t localThreads[3] = {1, threadNum, 1}; - size_t globalThreads[3] = {1, src.rows, 1}; + + const int number_of_input = indices.empty() ? src.rows : indices.size().area(); + + const int src_offset = (int)src.offset/src.elemSize(); + const int centers_offset = (int)centers.offset/centers.elemSize(); + + size_t globalThreads[3] = {number_of_input, 1, 1}; - vector > args; - args.push_back(make_pair(sizeof(cl_mem), (void *)&src.data)); - args.push_back(make_pair(sizeof(cl_mem), (void *)&centers.data)); + std::vector > args; - args.push_back(std::make_pair(sizeof(cl_int), (void *)&labels_step)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&centers.rows)); + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&src.data)); - args.push_back(std::make_pair(sizeof(cl_mem), (void *)&labels.data)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&centers.cols)); - args.push_back(std::make_pair(sizeof(cl_int), (void *)&src.rows)); + args.push_back(std::make_pair(sizeof(cl_mem), (void *)&centers.data)); - args.push_back(std::make_pair(sizeof(cl_mem), (void*)&dists.data)); + if(!indices.empty()) + { - args.push_back(make_pair(sizeof(cl_mem), (void *)&indices.data)); ++ args.push_back(std::make_pair(sizeof(cl_mem), (void *)&indices.data)); + } - args.push_back(make_pair(sizeof(cl_mem), (void *)&labels.data)); - 
args.push_back(make_pair(sizeof(cl_mem), (void *)&dists.data)); - args.push_back(make_pair(sizeof(cl_int), (void *)&colsNumb)); - args.push_back(make_pair(sizeof(cl_int), (void *)&src_step)); - args.push_back(make_pair(sizeof(cl_int), (void *)&centers_step)); - args.push_back(make_pair(sizeof(cl_int), (void *)&label_step)); - args.push_back(make_pair(sizeof(cl_int), (void *)&number_of_input)); - args.push_back(make_pair(sizeof(cl_int), (void *)&centers.rows)); - args.push_back(make_pair(sizeof(cl_int), (void *)&src_offset)); - args.push_back(make_pair(sizeof(cl_int), (void *)&centers_offset)); ++ args.push_back(std::make_pair(sizeof(cl_mem), (void *)&labels.data)); ++ args.push_back(std::make_pair(sizeof(cl_mem), (void *)&dists.data)); ++ args.push_back(std::make_pair(sizeof(cl_int), (void *)&colsNumb)); ++ args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_step)); ++ args.push_back(std::make_pair(sizeof(cl_int), (void *)&centers_step)); ++ args.push_back(std::make_pair(sizeof(cl_int), (void *)&label_step)); ++ args.push_back(std::make_pair(sizeof(cl_int), (void *)&number_of_input)); ++ args.push_back(std::make_pair(sizeof(cl_int), (void *)&centers.rows)); ++ args.push_back(std::make_pair(sizeof(cl_int), (void *)&src_offset)); ++ args.push_back(std::make_pair(sizeof(cl_int), (void *)&centers_offset)); - openCLExecuteKernel(clCxt, &kmeans_kernel, kernelname, globalThreads, localThreads, args, -1, -1, NULL); + openCLExecuteKernel(Context::getContext(), &kmeans_kernel, + kernelname, globalThreads, NULL, args, -1, -1, build_opt.c_str()); } ///////////////////////////////////k - means ///////////////////////////////////////////////////////// double cv::ocl::kmeans(const oclMat &_src, int K, oclMat &_bestLabels, diff --cc modules/ocl/src/moments.cpp index 6372364,f11d381..0ba6e8c --- a/modules/ocl/src/moments.cpp +++ b/modules/ocl/src/moments.cpp @@@ -44,301 -44,344 +44,348 @@@ // //M*/ #include "precomp.hpp" + +#include "opencv2/imgproc/types_c.h" +#include 
"opencv2/imgproc/imgproc_c.h" + #include "opencl_kernels.hpp" + #if defined _MSC_VER + #define snprintf sprintf_s + #endif namespace cv { - namespace ocl - { - // The function calculates center of gravity and the central second order moments - static void icvCompleteMomentState( CvMoments* moments ) - { - double cx = 0, cy = 0; - double mu20, mu11, mu02; - - assert( moments != 0 ); - moments->inv_sqrt_m00 = 0; - - if( fabs(moments->m00) > DBL_EPSILON ) - { - double inv_m00 = 1. / moments->m00; - cx = moments->m10 * inv_m00; - cy = moments->m01 * inv_m00; - moments->inv_sqrt_m00 = std::sqrt( fabs(inv_m00) ); - } - - // mu20 = m20 - m10*cx - mu20 = moments->m20 - moments->m10 * cx; - // mu11 = m11 - m10*cy - mu11 = moments->m11 - moments->m10 * cy; - // mu02 = m02 - m01*cy - mu02 = moments->m02 - moments->m01 * cy; - - moments->mu20 = mu20; - moments->mu11 = mu11; - moments->mu02 = mu02; - - // mu30 = m30 - cx*(3*mu20 + cx*m10) - moments->mu30 = moments->m30 - cx * (3 * mu20 + cx * moments->m10); - mu11 += mu11; - // mu21 = m21 - cx*(2*mu11 + cx*m01) - cy*mu20 - moments->mu21 = moments->m21 - cx * (mu11 + cx * moments->m01) - cy * mu20; - // mu12 = m12 - cy*(2*mu11 + cy*m10) - cx*mu02 - moments->mu12 = moments->m12 - cy * (mu11 + cy * moments->m10) - cx * mu02; - // mu03 = m03 - cy*(3*mu02 + cy*m01) - moments->mu03 = moments->m03 - cy * (3 * mu02 + cy * moments->m01); - } - - - static void icvContourMoments( CvSeq* contour, CvMoments* mom ) - { - if( contour->total ) + namespace ocl { - CvSeqReader reader; - int lpt = contour->total; - double a00, a10, a01, a20, a11, a02, a30, a21, a12, a03; - - cvStartReadSeq( contour, &reader, 0 ); + // The function calculates center of gravity and the central second order moments + static void icvCompleteMomentState( CvMoments* moments ) + { + double cx = 0, cy = 0; + double mu20, mu11, mu02; - size_t reader_size = lpt << 1; - cv::Mat reader_mat(1,reader_size,CV_32FC1); + assert( moments != 0 ); + moments->inv_sqrt_m00 = 0; - bool 
is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2; + if( fabs(moments->m00) > DBL_EPSILON ) + { + double inv_m00 = 1. / moments->m00; + cx = moments->m10 * inv_m00; + cy = moments->m01 * inv_m00; + moments->inv_sqrt_m00 = std::sqrt( fabs(inv_m00) ); + } - if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE) && is_float) - { - CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!"); + // mu20 = m20 - m10*cx + mu20 = moments->m20 - moments->m10 * cx; + // mu11 = m11 - m10*cy + mu11 = moments->m11 - moments->m10 * cy; + // mu02 = m02 - m01*cy + mu02 = moments->m02 - moments->m01 * cy; + + moments->mu20 = mu20; + moments->mu11 = mu11; + moments->mu02 = mu02; + + // mu30 = m30 - cx*(3*mu20 + cx*m10) + moments->mu30 = moments->m30 - cx * (3 * mu20 + cx * moments->m10); + mu11 += mu11; + // mu21 = m21 - cx*(2*mu11 + cx*m01) - cy*mu20 + moments->mu21 = moments->m21 - cx * (mu11 + cx * moments->m01) - cy * mu20; + // mu12 = m12 - cy*(2*mu11 + cy*m10) - cx*mu02 + moments->mu12 = moments->m12 - cy * (mu11 + cy * moments->m10) - cx * mu02; + // mu03 = m03 - cy*(3*mu02 + cy*m01) + moments->mu03 = moments->m03 - cy * (3 * mu02 + cy * moments->m01); } - if( is_float ) + + static void icvContourMoments( CvSeq* contour, CvMoments* mom ) { - for(size_t i = 0; i < reader_size; ++i) + if( contour->total ) { - reader_mat.at(0, i++) = ((CvPoint2D32f*)(reader.ptr))->x; - reader_mat.at(0, i) = ((CvPoint2D32f*)(reader.ptr))->y; - CV_NEXT_SEQ_ELEM( contour->elem_size, reader ); + CvSeqReader reader; + int lpt = contour->total; + double a00, a10, a01, a20, a11, a02, a30, a21, a12, a03; + + cvStartReadSeq( contour, &reader, 0 ); + + size_t reader_size = lpt << 1; + cv::Mat reader_mat(1,reader_size,CV_32FC1); + + bool is_float = CV_SEQ_ELTYPE(contour) == CV_32FC2; + + if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE) && is_float) + { + CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!"); + } + + 
if( is_float ) + { + for(size_t i = 0; i < reader_size; ++i) + { + reader_mat.at(0, i++) = ((CvPoint2D32f*)(reader.ptr))->x; + reader_mat.at(0, i) = ((CvPoint2D32f*)(reader.ptr))->y; + CV_NEXT_SEQ_ELEM( contour->elem_size, reader ); + } + } + else + { + for(size_t i = 0; i < reader_size; ++i) + { + reader_mat.at(0, i++) = ((CvPoint*)(reader.ptr))->x; + reader_mat.at(0, i) = ((CvPoint*)(reader.ptr))->y; + CV_NEXT_SEQ_ELEM( contour->elem_size, reader ); + } + } + + cv::ocl::oclMat dst_a(10, lpt, CV_64FC1); + cv::ocl::oclMat reader_oclmat(reader_mat); + int llength = std::min(lpt,128); + size_t localThreads[3] = { llength, 1, 1}; + size_t globalThreads[3] = { lpt, 1, 1}; - vector > args; - args.push_back( make_pair( sizeof(cl_int) , (void *)&contour->total )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&reader_oclmat.data )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_a.data )); ++ std::vector > args; ++ args.push_back( std::make_pair( sizeof(cl_int) , (void *)&contour->total )); ++ args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&reader_oclmat.data )); ++ args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a.data )); + cl_int dst_step = (cl_int)dst_a.step; - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_step )); ++ args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step )); + + char builOption[128]; + snprintf(builOption, 128, "-D CV_8UC1"); + + openCLExecuteKernel(dst_a.clCxt, &moments, "icvContourMoments", globalThreads, localThreads, args, -1, -1, builOption); + + cv::Mat dst(dst_a); + a00 = a10 = a01 = a20 = a11 = a02 = a30 = a21 = a12 = a03 = 0.0; + if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE)) + { + for (int i = 0; i < contour->total; ++i) + { + a00 += dst.at(0, i); + a10 += dst.at(1, i); + a01 += dst.at(2, i); + a20 += dst.at(3, i); + a11 += dst.at(4, i); + a02 += dst.at(5, i); + a30 += dst.at(6, i); + a21 += dst.at(7, i); + a12 += dst.at(8, i); + a03 += dst.at(9, 
i); + } + } + else + { + a00 = cv::sum(dst.row(0))[0]; + a10 = cv::sum(dst.row(1))[0]; + a01 = cv::sum(dst.row(2))[0]; + a20 = cv::sum(dst.row(3))[0]; + a11 = cv::sum(dst.row(4))[0]; + a02 = cv::sum(dst.row(5))[0]; + a30 = cv::sum(dst.row(6))[0]; + a21 = cv::sum(dst.row(7))[0]; + a12 = cv::sum(dst.row(8))[0]; + a03 = cv::sum(dst.row(9))[0]; + } + + double db1_2, db1_6, db1_12, db1_24, db1_20, db1_60; + if( fabs(a00) > FLT_EPSILON ) + { + if( a00 > 0 ) + { + db1_2 = 0.5; + db1_6 = 0.16666666666666666666666666666667; + db1_12 = 0.083333333333333333333333333333333; + db1_24 = 0.041666666666666666666666666666667; + db1_20 = 0.05; + db1_60 = 0.016666666666666666666666666666667; + } + else + { + db1_2 = -0.5; + db1_6 = -0.16666666666666666666666666666667; + db1_12 = -0.083333333333333333333333333333333; + db1_24 = -0.041666666666666666666666666666667; + db1_20 = -0.05; + db1_60 = -0.016666666666666666666666666666667; + } + + // spatial moments + mom->m00 = a00 * db1_2; + mom->m10 = a10 * db1_6; + mom->m01 = a01 * db1_6; + mom->m20 = a20 * db1_12; + mom->m11 = a11 * db1_24; + mom->m02 = a02 * db1_12; + mom->m30 = a30 * db1_20; + mom->m21 = a21 * db1_60; + mom->m12 = a12 * db1_60; + mom->m03 = a03 * db1_20; + + icvCompleteMomentState( mom ); + } } } - else + + Moments ocl_moments(oclMat& src, bool binary) //for image { - for(size_t i = 0; i < reader_size; ++i) + CV_Assert(src.oclchannels() == 1); + if(src.type() == CV_64FC1 && !Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE)) { - reader_mat.at(0, i++) = ((CvPoint*)(reader.ptr))->x; - reader_mat.at(0, i) = ((CvPoint*)(reader.ptr))->y; - CV_NEXT_SEQ_ELEM( contour->elem_size, reader ); + CV_Error(CV_StsUnsupportedFormat, "Moments - double is not supported by your GPU!"); } - } - cv::ocl::oclMat dst_a(10, lpt, CV_64FC1); - cv::ocl::oclMat reader_oclmat(reader_mat); - int llength = std::min(lpt,128); - size_t localThreads[3] = { llength, 1, 1}; - size_t globalThreads[3] = { lpt, 1, 1}; - std::vector > args; - 
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&contour->total )); - args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&reader_oclmat.data )); - args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_a.data )); - cl_int dst_step = (cl_int)dst_a.step; - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step )); - - openCLExecuteKernel2(dst_a.clCxt, &moments, "icvContourMoments", globalThreads, localThreads, args, -1, -1); - - cv::Mat dst(dst_a); - a00 = a10 = a01 = a20 = a11 = a02 = a30 = a21 = a12 = a03 = 0.0; - if (!cv::ocl::Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE)) - { - for (int i = 0; i < contour->total; ++i) + if(binary) { - a00 += dst.at(0, i); - a10 += dst.at(1, i); - a01 += dst.at(2, i); - a20 += dst.at(3, i); - a11 += dst.at(4, i); - a02 += dst.at(5, i); - a30 += dst.at(6, i); - a21 += dst.at(7, i); - a12 += dst.at(8, i); - a03 += dst.at(9, i); + oclMat mask; + if(src.type() != CV_8UC1) + { + src.convertTo(mask, CV_8UC1); + } + oclMat src8u(src.size(), CV_8UC1); + src8u.setTo(Scalar(255), mask); + src = src8u; } - } - else - { - a00 = cv::sum(dst.row(0))[0]; - a10 = cv::sum(dst.row(1))[0]; - a01 = cv::sum(dst.row(2))[0]; - a20 = cv::sum(dst.row(3))[0]; - a11 = cv::sum(dst.row(4))[0]; - a02 = cv::sum(dst.row(5))[0]; - a30 = cv::sum(dst.row(6))[0]; - a21 = cv::sum(dst.row(7))[0]; - a12 = cv::sum(dst.row(8))[0]; - a03 = cv::sum(dst.row(9))[0]; - } + const int TILE_SIZE = 256; - double db1_2, db1_6, db1_12, db1_24, db1_20, db1_60; - if( fabs(a00) > FLT_EPSILON ) - { - if( a00 > 0 ) + CvMoments mom; + memset(&mom, 0, sizeof(mom)); + + cv::Size size = src.size(); + int blockx, blocky; + blockx = (size.width + TILE_SIZE - 1)/TILE_SIZE; + blocky = (size.height + TILE_SIZE - 1)/TILE_SIZE; + + oclMat dst_m; + int tile_height = TILE_SIZE; + + size_t localThreads[3] = {1, tile_height, 1}; + size_t globalThreads[3] = {blockx, size.height, 1}; + + if(Context::getContext()->supportsFeature(FEATURE_CL_DOUBLE)) { - db1_2 
= 0.5; - db1_6 = 0.16666666666666666666666666666667; - db1_12 = 0.083333333333333333333333333333333; - db1_24 = 0.041666666666666666666666666666667; - db1_20 = 0.05; - db1_60 = 0.016666666666666666666666666666667; + dst_m.create(blocky * 10, blockx, CV_64FC1); + }else + { + dst_m.create(blocky * 10, blockx, CV_32FC1); } + + int src_step = (int)(src.step/src.elemSize()); + int dstm_step = (int)(dst_m.step/dst_m.elemSize()); + - vector > args,args_sum; - args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&src_step )); - args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_m.data )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dst_m.cols )); - args.push_back( make_pair( sizeof(cl_int) , (void *)&dstm_step )); ++ std::vector > args,args_sum; ++ args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data )); ++ args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows )); ++ args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols )); ++ args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step )); ++ args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m.data )); ++ args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_m.cols )); ++ args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dstm_step )); + + int binary_; + if(binary) + binary_ = 1; else + binary_ = 0; - args.push_back( make_pair( sizeof(cl_int) , (void *)&binary_)); ++ args.push_back( std::make_pair( sizeof(cl_int) , (void *)&binary_)); + + char builOption[128]; + if(binary || src.type() == CV_8UC1) + { + snprintf(builOption, 128, "-D CV_8UC1"); + }else if(src.type() == CV_16UC1) { - db1_2 = -0.5; - db1_6 = -0.16666666666666666666666666666667; - db1_12 = -0.083333333333333333333333333333333; - db1_24 = 
-0.041666666666666666666666666666667; - db1_20 = -0.05; - db1_60 = -0.016666666666666666666666666666667; + snprintf(builOption, 128, "-D CV_16UC1"); + }else if(src.type() == CV_16SC1) + { + snprintf(builOption, 128, "-D CV_16SC1"); + }else if(src.type() == CV_32FC1) + { + snprintf(builOption, 128, "-D CV_32FC1"); + }else if(src.type() == CV_64FC1) + { + snprintf(builOption, 128, "-D CV_64FC1"); + }else + { + CV_Error( CV_StsUnsupportedFormat, "" ); } - // spatial moments - mom->m00 = a00 * db1_2; - mom->m10 = a10 * db1_6; - mom->m01 = a01 * db1_6; - mom->m20 = a20 * db1_12; - mom->m11 = a11 * db1_24; - mom->m02 = a02 * db1_12; - mom->m30 = a30 * db1_20; - mom->m21 = a21 * db1_60; - mom->m12 = a12 * db1_60; - mom->m03 = a03 * db1_20; - - icvCompleteMomentState( mom ); - } - } - } - - static void ocl_cvMoments( const void* array, CvMoments* mom, int binary ) - { - const int TILE_SIZE = 256; - int type, depth, cn, coi = 0; - CvMat stub, *mat = (CvMat*)array; - CvContour contourHeader; - CvSeq* contour = 0; - CvSeqBlock block; - if( CV_IS_SEQ( array )) - { - contour = (CvSeq*)array; - if( !CV_IS_SEQ_POINT_SET( contour )) - CV_Error( CV_StsBadArg, "The passed sequence is not a valid contour" ); - } + openCLExecuteKernel(Context::getContext(), &moments, "CvMoments", globalThreads, localThreads, args, -1, -1, builOption); - if( !mom ) - CV_Error( CV_StsNullPtr, "" ); + Mat tmp(dst_m); + tmp.convertTo(tmp, CV_64FC1); - memset( mom, 0, sizeof(*mom)); + double tmp_m[10] = {0}; - if( !contour ) - { + for(int j = 0; j < tmp.rows; j += 10) + { + for(int i = 0; i < tmp.cols; i++) + { + tmp_m[0] += tmp.at(j, i); + tmp_m[1] += tmp.at(j + 1, i); + tmp_m[2] += tmp.at(j + 2, i); + tmp_m[3] += tmp.at(j + 3, i); + tmp_m[4] += tmp.at(j + 4, i); + tmp_m[5] += tmp.at(j + 5, i); + tmp_m[6] += tmp.at(j + 6, i); + tmp_m[7] += tmp.at(j + 7, i); + tmp_m[8] += tmp.at(j + 8, i); + tmp_m[9] += tmp.at(j + 9, i); + } + } - mat = cvGetMat( mat, &stub, &coi ); - type = CV_MAT_TYPE( mat->type ); + 
mom.m00 = tmp_m[0]; + mom.m10 = tmp_m[1]; + mom.m01 = tmp_m[2]; + mom.m20 = tmp_m[3]; + mom.m11 = tmp_m[4]; + mom.m02 = tmp_m[5]; + mom.m30 = tmp_m[6]; + mom.m21 = tmp_m[7]; + mom.m12 = tmp_m[8]; + mom.m03 = tmp_m[9]; + icvCompleteMomentState( &mom ); + return mom; + } - if( type == CV_32SC2 || type == CV_32FC2 ) + Moments ocl_moments(InputArray _contour) //for contour { - contour = cvPointSeqFromMat( - CV_SEQ_KIND_CURVE | CV_SEQ_FLAG_CLOSED, - mat, &contourHeader, &block ); - } - } - if( contour ) - { - icvContourMoments( contour, mom ); - return; - } + CvMoments mom; + memset(&mom, 0, sizeof(mom)); - type = CV_MAT_TYPE( mat->type ); - depth = CV_MAT_DEPTH( type ); - cn = CV_MAT_CN( type ); - - cv::Size size = cvGetMatSize( mat ); - if( cn > 1 && coi == 0 ) - CV_Error( CV_StsBadArg, "Invalid image type" ); - - if( size.width <= 0 || size.height <= 0 ) - return; - - cv::Mat src0 = cv::cvarrToMat(mat); - cv::ocl::oclMat src(src0); - cv::Size tileSize; - int blockx,blocky; - if(size.width%TILE_SIZE == 0) - blockx = size.width/TILE_SIZE; - else - blockx = size.width/TILE_SIZE + 1; - if(size.height%TILE_SIZE == 0) - blocky = size.height/TILE_SIZE; - else - blocky = size.height/TILE_SIZE + 1; - oclMat dst_m(blocky * 10, blockx, CV_64FC1); - oclMat sum(1, 10, CV_64FC1); - int tile_width = std::min(size.width,TILE_SIZE); - int tile_height = std::min(size.height,TILE_SIZE); - size_t localThreads[3] = { tile_height, 1, 1}; - size_t globalThreads[3] = { size.height, blockx, 1}; - std::vector > args,args_sum; - args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.step )); - args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m.data )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_m.cols )); - args.push_back( std::make_pair( 
sizeof(cl_int) , (void *)&dst_m.step )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&blocky )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&depth )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cn )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&coi )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&binary )); - args.push_back( std::make_pair( sizeof(cl_int) , (void *)&TILE_SIZE )); - openCLExecuteKernel2(Context::getContext(), &moments, "CvMoments", globalThreads, localThreads, args, -1, depth); - - size_t localThreadss[3] = { 128, 1, 1}; - size_t globalThreadss[3] = { 128, 1, 1}; - args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows )); - args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols )); - args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&tile_height )); - args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&tile_width )); - args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&TILE_SIZE )); - args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&sum.data )); - args_sum.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_m.data )); - args_sum.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_m.step )); - openCLExecuteKernel2(Context::getContext(), &moments, "dst_sum", globalThreadss, localThreadss, args_sum, -1, -1); - - Mat dstsum(sum); - mom->m00 = dstsum.at(0, 0); - mom->m10 = dstsum.at(0, 1); - mom->m01 = dstsum.at(0, 2); - mom->m20 = dstsum.at(0, 3); - mom->m11 = dstsum.at(0, 4); - mom->m02 = dstsum.at(0, 5); - mom->m30 = dstsum.at(0, 6); - mom->m21 = dstsum.at(0, 7); - mom->m12 = dstsum.at(0, 8); - mom->m03 = dstsum.at(0, 9); - - icvCompleteMomentState( mom ); - } + Mat arr = _contour.getMat(); + CvMat c_array = arr; + const void* array = &c_array; - Moments ocl_moments( InputArray _array, bool binaryImage ) - { - CvMoments om; - Mat arr = _array.getMat(); - CvMat c_array = arr; - 
ocl_cvMoments(&c_array, &om, binaryImage); - return om; - } + CvSeq* contour = 0; + if( CV_IS_SEQ( array )) + { + contour = (CvSeq*)(array); + if( !CV_IS_SEQ_POINT_SET( contour )) + CV_Error( CV_StsBadArg, "The passed sequence is not a valid contour" ); + } - } + int type, coi = 0; + + CvMat stub, *mat = (CvMat*)(array); + CvContour contourHeader; + CvSeqBlock block; + + if( !contour ) + { + mat = cvGetMat( mat, &stub, &coi ); + type = CV_MAT_TYPE( mat->type ); + + if( type == CV_32SC2 || type == CV_32FC2 ) + { + contour = cvPointSeqFromMat( + CV_SEQ_KIND_CURVE | CV_SEQ_FLAG_CLOSED, + mat, &contourHeader, &block ); + } + } + + CV_Assert(contour); + icvContourMoments(contour, &mom); + return mom; + } + } -} +} diff --cc modules/ocl/src/safe_call.hpp index 6bc73ef,f772e1b..bd409c8 --- a/modules/ocl/src/safe_call.hpp +++ b/modules/ocl/src/safe_call.hpp @@@ -65,8 -66,8 +65,8 @@@ namespace c static inline void ___openCLSafeCall(int err, const char *file, const int line, const char *func = "") { - if( CL_SUCCESS != err) + if (CL_SUCCESS != err) - cv::ocl::error(getOpenCLErrorString(err), file, line, func); + cv::error(Error::OpenCLApiCallError, getOpenCLErrorString(err), func, file, line); } } } diff --cc modules/ocl/src/split_merge.cpp index 990c91c,60a27a5..073a7a7 --- a/modules/ocl/src/split_merge.cpp +++ b/modules/ocl/src/split_merge.cpp @@@ -148,73 -149,112 +148,112 @@@ namespace c mat_dst.create(size, CV_MAKETYPE(depth, total_channels)); merge_vector_run(mat_src, n, mat_dst); } - static void split_vector_run(const oclMat &mat_src, oclMat *mat_dst) + static void split_vector_run(const oclMat &src, oclMat *dst) { - if(!mat_src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && mat_src.type() == CV_64F) + if(!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.type() == CV_64F) { - CV_Error(CV_OpenCLDoubleNotSupported, "Selected device doesn't support double"); + CV_Error(Error::OpenCLDoubleNotSupported, "Selected device doesn't support double"); return; } - Context *clCxt 
= mat_src.clCxt; - int channels = mat_src.oclchannels(); - int depth = mat_src.depth(); + Context *clCtx = src.clCxt; + int channels = src.channels(); + int depth = src.depth(); + depth = (depth == CV_8S) ? CV_8U : depth; + depth = (depth == CV_16S) ? CV_16U : depth; - string kernelName = "split_vector"; + String kernelName = "split_vector"; - int vector_lengths[4][7] = {{0, 0, 0, 0, 0, 0, 0}, - {4, 4, 2, 2, 1, 1, 1}, - {4, 4, 2, 2 , 1, 1, 1}, - {4, 4, 2, 2, 1, 1, 1} - }; - - size_t vector_length = vector_lengths[channels - 1][mat_dst[0].depth()]; - - int max_offset_cols = 0; - for(int i = 0; i < channels; i++) - { - int offset_cols = (mat_dst[i].offset / mat_dst[i].elemSize()) & (vector_length - 1); - if(max_offset_cols < offset_cols) - max_offset_cols = offset_cols; - } - - int cols = vector_length == 1 ? divUp(mat_src.cols, vector_length) - : divUp(mat_src.cols + max_offset_cols, vector_length); - - size_t localThreads[3] = { 64, 4, 1 }; - size_t globalThreads[3] = { cols, mat_src.rows, 1 }; + size_t VEC_SIZE = 4; - int dst_step1 = mat_dst[0].cols * mat_dst[0].elemSize(); - vector > args; - args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); - args.push_back( make_pair( sizeof(cl_int), (void *)&src.step)); + std::vector > args; - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_src.data)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src.step)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_src.offset)); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[0].data)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[0].step)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[0].offset)); - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[1].data)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[1].step)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[1].offset)); - if(channels >= 3) ++ 
args.push_back( std::make_pair( sizeof(cl_mem), (void *)&src.data)); ++ args.push_back( std::make_pair( sizeof(cl_int), (void *)&src.step)); + int srcOffsetXBytes = src.offset % src.step; + int srcOffsetY = src.offset / src.step; + cl_int2 srcOffset = {{srcOffsetXBytes, srcOffsetY}}; - args.push_back( make_pair( sizeof(cl_int2), (void *)&srcOffset)); ++ args.push_back( std::make_pair( sizeof(cl_int2), (void *)&srcOffset)); + + bool dst0Aligned = false, dst1Aligned = false, dst2Aligned = false, dst3Aligned = false; + int alignSize = dst[0].elemSize1() * VEC_SIZE; + int alignMask = alignSize - 1; + - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst[0].data)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst[0].step)); ++ args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst[0].data)); ++ args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst[0].step)); + int dst0OffsetXBytes = dst[0].offset % dst[0].step; + int dst0OffsetY = dst[0].offset / dst[0].step; + cl_int2 dst0Offset = {{dst0OffsetXBytes, dst0OffsetY}}; - args.push_back( make_pair( sizeof(cl_int2), (void *)&dst0Offset)); ++ args.push_back( std::make_pair( sizeof(cl_int2), (void *)&dst0Offset)); + if ((dst0OffsetXBytes & alignMask) == 0) + dst0Aligned = true; + - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst[1].data)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst[1].step)); ++ args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst[1].data)); ++ args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst[1].step)); + int dst1OffsetXBytes = dst[1].offset % dst[1].step; + int dst1OffsetY = dst[1].offset / dst[1].step; + cl_int2 dst1Offset = {{dst1OffsetXBytes, dst1OffsetY}}; - args.push_back( make_pair( sizeof(cl_int2), (void *)&dst1Offset)); ++ args.push_back( std::make_pair( sizeof(cl_int2), (void *)&dst1Offset)); + if ((dst1OffsetXBytes & alignMask) == 0) + dst1Aligned = true; + + // DON'T MOVE VARIABLES INTO 'IF' BODY + int dst2OffsetXBytes, 
dst2OffsetY; + cl_int2 dst2Offset; + int dst3OffsetXBytes, dst3OffsetY; + cl_int2 dst3Offset; + if (channels >= 3) { - - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[2].data)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[2].step)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[2].offset)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst[2].data)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst[2].step)); ++ args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst[2].data)); ++ args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst[2].step)); + dst2OffsetXBytes = dst[2].offset % dst[2].step; + dst2OffsetY = dst[2].offset / dst[2].step; + dst2Offset.s[0] = dst2OffsetXBytes; dst2Offset.s[1] = dst2OffsetY; - args.push_back( make_pair( sizeof(cl_int2), (void *)&dst2Offset)); ++ args.push_back( std::make_pair( sizeof(cl_int2), (void *)&dst2Offset)); + if ((dst2OffsetXBytes & alignMask) == 0) + dst2Aligned = true; } - if(channels >= 4) + + if (channels >= 4) { - args.push_back( std::make_pair( sizeof(cl_mem), (void *)&mat_dst[3].data)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[3].step)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&mat_dst[3].offset)); - args.push_back( make_pair( sizeof(cl_mem), (void *)&dst[3].data)); - args.push_back( make_pair( sizeof(cl_int), (void *)&dst[3].step)); ++ args.push_back( std::make_pair( sizeof(cl_mem), (void *)&dst[3].data)); ++ args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst[3].step)); + dst3OffsetXBytes = dst[3].offset % dst[3].step; + dst3OffsetY = dst[3].offset / dst[3].step; + dst3Offset.s[0] = dst3OffsetXBytes; dst3Offset.s[1] = dst3OffsetY; - args.push_back( make_pair( sizeof(cl_int2), (void *)&dst3Offset)); ++ args.push_back( std::make_pair( sizeof(cl_int2), (void *)&dst3Offset)); + if ((dst3OffsetXBytes & alignMask) == 0) + dst3Aligned = true; } - args.push_back( std::make_pair( 
sizeof(cl_int), (void *)&mat_src.rows)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&cols)); - args.push_back( std::make_pair( sizeof(cl_int), (void *)&dst_step1)); - - openCLExecuteKernel(clCxt, &split_mat, kernelName, globalThreads, localThreads, args, channels, depth); + cl_int2 size = {{ src.cols, src.rows }}; - args.push_back( make_pair( sizeof(cl_int2), (void *)&size)); ++ args.push_back( std::make_pair( sizeof(cl_int2), (void *)&size)); + - string build_options = ++ String build_options = + cv::format("-D VEC_SIZE=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d", + (int)VEC_SIZE, depth, channels); + + if (dst0Aligned) - build_options += " -D DST0_ALIGNED"; ++ build_options = build_options + " -D DST0_ALIGNED"; + if (dst1Aligned) - build_options += " -D DST1_ALIGNED"; ++ build_options = build_options + " -D DST1_ALIGNED"; + if (dst2Aligned) - build_options += " -D DST2_ALIGNED"; ++ build_options = build_options + " -D DST2_ALIGNED"; + if (dst3Aligned) - build_options += " -D DST3_ALIGNED"; ++ build_options = build_options + " -D DST3_ALIGNED"; + + const DeviceInfo& devInfo = clCtx->getDeviceInfo(); + + // TODO Workaround for issues. Need to investigate a problem. 
+ if (channels == 2 + && devInfo.deviceType == CVCL_DEVICE_TYPE_CPU + && devInfo.platform->platformVendor.find("Intel") != std::string::npos + && (devInfo.deviceVersion.find("Build 56860") != std::string::npos + || devInfo.deviceVersion.find("Build 76921") != std::string::npos)) - build_options += " -D BYPASS_VSTORE=true"; ++ build_options = build_options + " -D BYPASS_VSTORE=true"; + + size_t globalThreads[3] = { divUp(src.cols, VEC_SIZE), src.rows, 1 }; + openCLExecuteKernel(clCtx, &split_mat, kernelName, globalThreads, NULL, args, -1, -1, build_options.c_str()); } static void split(const oclMat &mat_src, oclMat *mat_dst) { @@@ -253,9 -292,9 +291,9 @@@ void cv::ocl::split(const oclMat &src, { split_merge::split(src, dst); } -void cv::ocl::split(const oclMat &src, vector &dst) +void cv::ocl::split(const oclMat &src, std::vector &dst) { - dst.resize(src.oclchannels()); + dst.resize(src.oclchannels()); // TODO Why oclchannels? if(src.oclchannels() > 0) split_merge::split(src, &dst[0]); } diff --cc modules/ocl/test/test_kmeans.cpp index 94263d8,6539c51..d583cc9 --- a/modules/ocl/test/test_kmeans.cpp +++ b/modules/ocl/test/test_kmeans.cpp @@@ -114,13 -113,11 +113,11 @@@ OCL_TEST_P(Kmeans, Mat) for(int j = 0; j < LOOP_TIMES; j++) { kmeans(src, K, labels, - TermCriteria( CV_TERMCRIT_EPS+CV_TERMCRIT_ITER, 100, 0), + TermCriteria( TermCriteria::EPS + TermCriteria::MAX_ITER, 100, 0), 1, flags, centers); - ocl::kmeans(d_src, K, d_labels, - TermCriteria( CV_TERMCRIT_EPS+CV_TERMCRIT_ITER, 100, 0), + TermCriteria( TermCriteria::EPS + TermCriteria::MAX_ITER, 100, 0), 1, flags, d_centers); - Mat dd_labels(d_labels); Mat dd_centers(d_centers); if(flags & KMEANS_USE_INITIAL_LABELS) diff --cc modules/ocl/test/test_moments.cpp index 7118609,788ac91..e978bb2 --- a/modules/ocl/test/test_moments.cpp +++ b/modules/ocl/test/test_moments.cpp @@@ -7,35 -8,35 +7,33 @@@ using namespace cv using namespace cv::ocl; using namespace cvtest; using namespace testing; - PARAM_TEST_CASE(MomentsTest, 
MatType, bool) -using namespace std; - + PARAM_TEST_CASE(MomentsTest, MatType, bool, bool) { int type; - cv::Mat mat1; + cv::Mat mat; bool test_contours; - + bool binaryImage; virtual void SetUp() { type = GET_PARAM(0); test_contours = GET_PARAM(1); - cv::Size size(10*MWIDTH, 10*MHEIGHT); - mat1 = randomMat(size, type, 5, 16, false); + cv::Size size(10 * MWIDTH, 10 * MHEIGHT); + mat = randomMat(size, type, 0, 256, false); + binaryImage = GET_PARAM(2); } - void Compare(Moments& cpu, Moments& gpu) + void Compare(Moments& cpu_moments, Moments& gpu_moments) { Mat gpu_dst, cpu_dst; - HuMoments(cpu, cpu_dst); - HuMoments(gpu, gpu_dst); - EXPECT_MAT_NEAR(gpu_dst,cpu_dst, 1e-3); + HuMoments(cpu_moments, cpu_dst); + HuMoments(gpu_moments, gpu_dst); - EXPECT_MAT_NEAR(gpu_dst, cpu_dst, .5); ++ EXPECT_MAT_NEAR(gpu_dst, cpu_dst, 1e-3); } - }; - OCL_TEST_P(MomentsTest, Mat) { - bool binaryImage = 0; - + oclMat src_d(mat); for(int j = 0; j < LOOP_TIMES; j++) { if(test_contours) @@@ -62,6 -62,5 +59,6 @@@ } } INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MomentsTest, Combine( - Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_64FC1), Values(true,false))); + Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1, CV_64FC1), Values(false, true), Values(false, true))); + #endif // HAVE_OPENCL diff --cc modules/superres/perf/perf_superres_ocl.cpp index 67bcf8c,9a8fab4..04a3f7e --- a/modules/superres/perf/perf_superres_ocl.cpp +++ b/modules/superres/perf/perf_superres_ocl.cpp @@@ -42,9 -42,9 +42,9 @@@ #include "perf_precomp.hpp" - #ifdef HAVE_OPENCL + #ifdef HAVE_OPENCV_OCL -#include "opencv2/ocl/ocl.hpp" +#include "opencv2/ocl.hpp" using namespace std; using namespace testing; using namespace perf; diff --cc samples/gpu/CMakeLists.txt index 2591d32,732a917..64c25fc --- a/samples/gpu/CMakeLists.txt +++ b/samples/gpu/CMakeLists.txt @@@ -56,11 -48,8 +56,11 @@@ if(BUILD_EXAMPLES AND OCV_DEPENDENCIES_ if(HAVE_opencv_nonfree) target_link_libraries(${the_target} opencv_nonfree) endif() + if(HAVE_opencv_cudacodec) + 
target_link_libraries(${the_target} opencv_cudacodec) + endif() - if(HAVE_OPENCL) + if(HAVE_opencv_ocl) target_link_libraries(${the_target} opencv_ocl) endif()