From: Alexey Spizhevoy
Date: Fri, 26 Nov 2010 07:50:11 +0000 (+0000)
Subject: added support of buffers into gpu::minMaxLoc, reduced memory requirements, refactored
X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~8305
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=bdaad9e1fead396fb73bae409fc68f2266c6bd9a;p=platform%2Fupstream%2Fopencv.git

added support of buffers into gpu::minMaxLoc, reduced memory requirements, refactored
---

diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 0edd50a..0d1e948 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -430,6 +430,11 @@ namespace cv
     //! finds global minimum and maximum array elements and returns their values with locations
     CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0);
 
+    //! finds global minimum and maximum array elements and returns their values with locations
+    CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
+                              GpuMat& valbuf, GpuMat& locbuf);
+
+
     //! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
     //! destination array will have the same depth as lut and the same number of channels as source
    //! supports CV_8UC1, CV_8UC3 types
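The new overload exists so a caller can keep the two scratch buffers alive across invocations instead of paying a device allocation per call. A minimal usage sketch of the buffered overload (hypothetical caller code, not part of this patch; `frames` and `nframes` are assumed):

    // Reuse valbuf/locbuf across iterations; they are created on first use.
    cv::gpu::GpuMat valbuf, locbuf;
    for (int i = 0; i < nframes; ++i)
    {
        double minVal, maxVal;
        cv::Point minLoc, maxLoc;
        cv::gpu::minMaxLoc(frames[i], &minVal, &maxVal, &minLoc, &maxLoc,
                           valbuf, locbuf);
        // ... consume the results; no per-call device allocation happens here.
    }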
diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp
index e9cc315..9d0fa75 100644
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -68,6 +68,7 @@ Scalar cv::gpu::sum(const GpuMat&) { throw_nogpu(); return Scalar(); }
 void cv::gpu::minMax(const GpuMat&, double*, double*) { throw_nogpu(); }
 void cv::gpu::minMax(const GpuMat&, double*, double*, GpuMat&) { throw_nogpu(); }
 void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*) { throw_nogpu(); }
+void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, GpuMat&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::exp(const GpuMat&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::log(const GpuMat&, GpuMat&) { throw_nogpu(); }
@@ -514,8 +515,8 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, GpuMat&
 {
     using namespace mathfunc::minmax;
 
-    double maxVal_;
-    if (!maxVal) maxVal = &maxVal_;
+    double minVal_; if (!minVal) minVal = &minVal_;
+    double maxVal_; if (!maxVal) maxVal = &maxVal_;
 
     GpuMat src_ = src.reshape(1);
 
@@ -561,53 +562,75 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, GpuMat&
 
 namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc {
 
+    void get_buf_size_required(int elem_size, int& b1cols, int& b1rows,
+                               int& b2cols, int& b2rows);
+
     template <typename T>
     void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval,
-                            int* minlocx, int* minlocy, int* maxlocx, int* maxlocy);
+                            int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf);
+
+    template <typename T>
+    void min_max_loc_caller_2steps(const DevMem2D src, double* minval, double* maxval,
+                                   int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf);
 
 }}}}
 
 
 void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc)
+{
+    GpuMat valbuf, locbuf;
+    minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, valbuf, locbuf);
+}
+
+
+void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
+                        GpuMat& valbuf, GpuMat& locbuf)
 {
     using namespace mathfunc::minmaxloc;
 
     CV_Assert(src.channels() == 1);
 
-    double maxVal_;
-    if (!maxVal) maxVal = &maxVal_;
+    double minVal_; if (!minVal) minVal = &minVal_;
+    double maxVal_; if (!maxVal) maxVal = &maxVal_;
+    int minLoc_[2];
+    int maxLoc_[2];
 
-    cv::Point minLoc_;
-    if (!minLoc) minLoc = &minLoc_;
+    Size valbuf_size, locbuf_size;
+    get_buf_size_required(src.elemSize(), valbuf_size.width, valbuf_size.height,
+                          locbuf_size.width, locbuf_size.height);
+    valbuf.create(valbuf_size, CV_8U);
+    locbuf.create(locbuf_size, CV_8U);
 
-    cv::Point maxLoc_;
-    if (!maxLoc) maxLoc = &maxLoc_;
-
-    switch (src.type())
+    int major, minor;
+    getComputeCapability(getDevice(), major, minor);
+
+    if (major >= 1 && minor >= 1)
+    {
+        switch (src.type())
+        {
+        case CV_8U: min_max_loc_caller<unsigned char>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
+        case CV_8S: min_max_loc_caller<signed char>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
+        case CV_16U: min_max_loc_caller<unsigned short>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
+        case CV_16S: min_max_loc_caller<signed short>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
+        case CV_32S: min_max_loc_caller<int>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
+        case CV_32F: min_max_loc_caller<float>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
+        case CV_64F: min_max_loc_caller<double>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
+        default: CV_Error(CV_StsBadArg, "Unsupported type");
+        }
+    }
+    else
     {
-    case CV_8U:
-        min_max_loc_caller<unsigned char>(src, minVal, maxVal, &minLoc->x, &minLoc->y, &maxLoc->x, &maxLoc->y);
-        break;
-    case CV_8S:
-        min_max_loc_caller<signed char>(src, minVal, maxVal, &minLoc->x, &minLoc->y, &maxLoc->x, &maxLoc->y);
-        break;
-    case CV_16U:
-        min_max_loc_caller<unsigned short>(src, minVal, maxVal, &minLoc->x, &minLoc->y, &maxLoc->x, &maxLoc->y);
-        break;
-    case CV_16S:
-        min_max_loc_caller<signed short>(src, minVal, maxVal, &minLoc->x, &minLoc->y, &maxLoc->x, &maxLoc->y);
-        break;
-    case CV_32S:
-        min_max_loc_caller<int>(src, minVal, maxVal, &minLoc->x, &minLoc->y, &maxLoc->x, &maxLoc->y);
-        break;
-    case CV_32F:
-        min_max_loc_caller<float>(src, minVal, maxVal, &minLoc->x, &minLoc->y, &maxLoc->x, &maxLoc->y);
-        break;
-    case CV_64F:
-        min_max_loc_caller<double>(src, minVal, maxVal, &minLoc->x, &minLoc->y, &maxLoc->x, &maxLoc->y);
-        break;
-    default:
-        CV_Error(CV_StsBadArg, "Unsupported type");
+        switch (src.type())
+        {
+        case CV_8U: min_max_loc_caller_2steps<unsigned char>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
+        case CV_8S: min_max_loc_caller_2steps<signed char>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
+        case CV_16U: min_max_loc_caller_2steps<unsigned short>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
+        case CV_16S: min_max_loc_caller_2steps<signed short>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
+        case CV_32S: min_max_loc_caller_2steps<int>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
+        case CV_32F: min_max_loc_caller_2steps<float>(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf); break;
+        default: CV_Error(CV_StsBadArg, "Unsupported type");
+        }
     }
+
+    if (minLoc) { minLoc->x = minLoc_[0]; minLoc->y = minLoc_[1]; }
+    if (maxLoc) { maxLoc->x = maxLoc_[0]; maxLoc->y = maxLoc_[1]; }
 }
 
 ////////////////////////////////////////////////////////////////////////
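Note the layout that get_buf_size_required (defined in mathfunc.cu below) gives the scratch buffers: two rows, where row 0 holds the per-block minima (or minimum locations) and row 1 the per-block maxima, one entry per block of the fixed 6x5 kernel grid. An equivalent size computation, illustrative only and assuming a single-channel src:

    // One scratch entry per block; the grid is fixed at 6 * 5 = 30 blocks.
    int blocks = 6 * 5;
    cv::Size valbuf_size(blocks * (int)src.elemSize(), 2); // row 0: minima, row 1: maxima
    cv::Size locbuf_size(blocks * (int)sizeof(int), 2);    // rows hold linear offsets into src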
diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu
index 8d5006f..b89a0d7 100644
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -412,8 +412,6 @@ namespace cv { namespace gpu { namespace mathfunc
     template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
     template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
 
-    // Available optimization operations
-    enum { OP_MIN, OP_MAX };
 
     namespace minmax
    {
@@ -466,7 +464,7 @@ namespace cv { namespace gpu { namespace mathfunc
 
         template <int nthreads, typename T>
-        __global__ void min_max_kernel(int cols, int rows, const PtrStep src, T* minval, T* maxval)
+        __global__ void min_max_kernel(const DevMem2D src, T* minval, T* maxval)
         {
             typedef typename MinMaxTypeTraits<T>::best_type best_type;
             __shared__ best_type sminval[nthreads];
@@ -479,10 +477,10 @@ namespace cv { namespace gpu { namespace mathfunc
             T val;
             T mymin = numeric_limits_gpu<T>::max();
             T mymax = numeric_limits_gpu<T>::min();
-            for (unsigned int y = 0; y < ctheight && y0 + y * blockDim.y < rows; ++y)
+            for (unsigned int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
             {
                 const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);
-                for (unsigned int x = 0; x < ctwidth && x0 + x * blockDim.x < cols; ++x)
+                for (unsigned int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
                 {
                     val = ptr[x0 + x * blockDim.x];
                     mymin = min(mymin, val);
@@ -509,8 +507,6 @@ namespace cv { namespace gpu { namespace mathfunc
                 if (nthreads >= 2) merge(tid, 1, sminval, smaxval);
             }
 
-            __syncthreads();
-
             if (tid == 0)
             {
                 minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
@@ -525,9 +521,9 @@ namespace cv { namespace gpu { namespace mathfunc
                 __threadfence();
                 if (atomicInc(&blocks_finished, gridDim.x * gridDim.y) == gridDim.x * gridDim.y - 1)
                 {
-                    mymin = numeric_limits_gpu<T>::max();
-                    mymax = numeric_limits_gpu<T>::min();
-                    for (unsigned int i = 0; i < gridDim.x * gridDim.y; ++i)
+                    mymin = minval[0];
+                    mymax = maxval[0];
+                    for (unsigned int i = 1; i < gridDim.x * gridDim.y; ++i)
                     {
                         mymin = min(mymin, minval[i]);
                         mymax = max(mymax, maxval[i]);
@@ -552,7 +548,7 @@ namespace cv { namespace gpu { namespace mathfunc
             T* maxval_buf = (T*)buf.ptr(1);
 
             cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
-            min_max_kernel<256, T><<<grid, threads>>>(src.cols, src.rows, src, minval_buf, maxval_buf);
+            min_max_kernel<256, T><<<grid, threads>>>(src, minval_buf, maxval_buf);
             cudaSafeCall(cudaThreadSynchronize());
 
             T minval_, maxval_;
@@ -576,9 +572,9 @@ namespace cv { namespace gpu { namespace mathfunc
         __global__ void min_max_kernel_2ndstep(T* minval, T* maxval, int size)
         {
             T val;
-            T mymin = numeric_limits_gpu<T>::max();
-            T mymax = numeric_limits_gpu<T>::min();
-            for (unsigned int i = 0; i < size; ++i)
+            T mymin = minval[0];
+            T mymax = maxval[0];
+            for (unsigned int i = 1; i < size; ++i)
             {
                 val = minval[i]; if (val < mymin) mymin = val;
                 val = maxval[i]; if (val > mymax) mymax = val;
@@ -599,7 +595,7 @@ namespace cv { namespace gpu { namespace mathfunc
             T* maxval_buf = (T*)buf.ptr(1);
 
             cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
-            min_max_kernel<256, T><<<grid, threads>>>(src.cols, src.rows, src, minval_buf, maxval_buf);
+            min_max_kernel<256, T><<<grid, threads>>>(src, minval_buf, maxval_buf);
             min_max_kernel_2ndstep<<<1, 1>>>(minval_buf, maxval_buf, grid.x * grid.y);
             cudaSafeCall(cudaThreadSynchronize());
 
@@ -622,220 +618,253 @@ namespace cv { namespace gpu { namespace mathfunc
 
     namespace minmaxloc
     {
-        template <typename T, int op> struct OptLoc {};
-
-        template <typename T>
-        struct OptLoc<T, OP_MIN>
+        __constant__ int ctwidth;
+        __constant__ int ctheight;
+
+        static const unsigned int czero = 0;
+
+        // Global counter of blocks that have finished their work
+        __device__ unsigned int blocks_finished;
+
+
+        // Estimates a good thread configuration
+        // - threads variable must satisfy threads.x * threads.y == 256
+        void estimate_thread_cfg(dim3& threads, dim3& grid)
         {
-            static __device__ void call(unsigned int tid, unsigned int offset, volatile T* optval, volatile unsigned int* optloc)
-            {
-                T val = optval[tid + offset];
-                if (val < optval[tid])
-                {
-                    optval[tid] = val;
-                    optloc[tid] = optloc[tid + offset];
-                }
-            }
-        };
+            threads = dim3(64, 4);
+            grid = dim3(6, 5);
+        }
+
+
+        // Returns the required buffer sizes
+        void get_buf_size_required(int elem_size, int& b1cols, int& b1rows,
+                                   int& b2cols, int& b2rows)
+        {
+            dim3 threads, grid;
+            estimate_thread_cfg(threads, grid);
+            b1cols = grid.x * grid.y * elem_size; // For values
+            b1rows = 2;
+            b2cols = grid.x * grid.y * sizeof(int); // For locations
+            b2rows = 2;
+        }
+
+
+        // Computes the device constants used by the kernels for the specified thread configuration
+        void estimate_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
+        {
+            int twidth = divUp(divUp(cols, grid.x), threads.x);
+            int theight = divUp(divUp(rows, grid.y), threads.y);
+            cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth)));
+            cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight)));
+        }
 
         template <typename T>
-        struct OptLoc<T, OP_MAX>
+        __device__ void merge(unsigned int tid, unsigned int offset, volatile T* minval, volatile T* maxval,
+                              volatile unsigned int* minloc, volatile unsigned int* maxloc)
         {
-            static __device__ void call(unsigned int tid, unsigned int offset, volatile T* optval, volatile unsigned int* optloc)
+            T val = minval[tid + offset];
+            if (val < minval[tid])
             {
-                T val = optval[tid + offset];
-                if (val > optval[tid])
-                {
-                    optval[tid] = val;
-                    optloc[tid] = optloc[tid + offset];
-                }
+                minval[tid] = val;
+                minloc[tid] = minloc[tid + offset];
             }
-        };
+            val = maxval[tid + offset];
+            if (val > maxval[tid])
+            {
+                maxval[tid] = val;
+                maxloc[tid] = maxloc[tid + offset];
+            }
+        }
 
-        template <int nthreads, int op, typename T>
-        __global__ void opt_loc_init_kernel(int cols, int rows, const PtrStep src, PtrStep optval, PtrStep optloc)
+
+        template <int nthreads, typename T>
+        __global__ void min_max_loc_kernel(const DevMem2D src, T* minval, T* maxval,
+                                           unsigned int* minloc, unsigned int* maxloc)
         {
             typedef typename MinMaxTypeTraits<T>::best_type best_type;
-            __shared__ best_type soptval[nthreads];
-            __shared__ unsigned int soptloc[nthreads];
+            __shared__ best_type sminval[nthreads];
+            __shared__ best_type smaxval[nthreads];
+            __shared__ unsigned int sminloc[nthreads];
+            __shared__ unsigned int smaxloc[nthreads];
 
-            unsigned int x0 = blockIdx.x * blockDim.x;
-            unsigned int y0 = blockIdx.y * blockDim.y;
+            unsigned int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
+            unsigned int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
             unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
 
-            if (x0 + threadIdx.x < cols && y0 + threadIdx.y < rows)
-            {
-                soptval[tid] = ((const T*)src.ptr(y0 + threadIdx.y))[x0 + threadIdx.x];
-                soptloc[tid] = (y0 + threadIdx.y) * cols + x0 + threadIdx.x;
-            }
-            else
+            T val = ((const T*)src.ptr(0))[0];
+            T mymin = val, mymax = val;
+            unsigned int myminloc = 0, mymaxloc = 0;
+            for (unsigned int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
             {
-                soptval[tid] = ((const T*)src.ptr(y0))[x0];
-                soptloc[tid] = y0 * cols + x0;
+                const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);
+                for (unsigned int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
+                {
+                    val = ptr[x0 + x * blockDim.x];
+                    if (val < mymin)
+                    {
+                        mymin = val;
+                        myminloc = (y0 + y * blockDim.y) * src.cols + x0 + x * blockDim.x;
+                    }
+                    else if (val > mymax)
+                    {
+                        mymax = val;
+                        mymaxloc = (y0 + y * blockDim.y) * src.cols + x0 + x * blockDim.x;
+                    }
+                }
             }
 
+            sminval[tid] = mymin;
+            smaxval[tid] = mymax;
+            sminloc[tid] = myminloc;
+            smaxloc[tid] = mymaxloc;
+
             __syncthreads();
 
-            if (nthreads >= 512) if (tid < 256) { OptLoc<best_type, op>::call(tid, 256, soptval, soptloc); __syncthreads(); }
-            if (nthreads >= 256) if (tid < 128) { OptLoc<best_type, op>::call(tid, 128, soptval, soptloc); __syncthreads(); }
-            if (nthreads >= 128) if (tid < 64) { OptLoc<best_type, op>::call(tid, 64, soptval, soptloc); __syncthreads(); }
+            if (nthreads >= 512) if (tid < 256) { merge(tid, 256, sminval, smaxval, sminloc, smaxloc); __syncthreads(); }
+            if (nthreads >= 256) if (tid < 128) { merge(tid, 128, sminval, smaxval, sminloc, smaxloc); __syncthreads(); }
+            if (nthreads >= 128) if (tid < 64) { merge(tid, 64, sminval, smaxval, sminloc, smaxloc); __syncthreads(); }
 
             if (tid < 32)
             {
-                if (nthreads >= 64) OptLoc<best_type, op>::call(tid, 32, soptval, soptloc);
-                if (nthreads >= 32) OptLoc<best_type, op>::call(tid, 16, soptval, soptloc);
-                if (nthreads >= 16) OptLoc<best_type, op>::call(tid, 8, soptval, soptloc);
-                if (nthreads >= 8) OptLoc<best_type, op>::call(tid, 4, soptval, soptloc);
-                if (nthreads >= 4) OptLoc<best_type, op>::call(tid, 2, soptval, soptloc);
-                if (nthreads >= 2) OptLoc<best_type, op>::call(tid, 1, soptval, soptloc);
+                if (nthreads >= 64) merge(tid, 32, sminval, smaxval, sminloc, smaxloc);
+                if (nthreads >= 32) merge(tid, 16, sminval, smaxval, sminloc, smaxloc);
+                if (nthreads >= 16) merge(tid, 8, sminval, smaxval, sminloc, smaxloc);
+                if (nthreads >= 8) merge(tid, 4, sminval, smaxval, sminloc, smaxloc);
+                if (nthreads >= 4) merge(tid, 2, sminval, smaxval, sminloc, smaxloc);
+                if (nthreads >= 2) merge(tid, 1, sminval, smaxval, sminloc, smaxloc);
             }
 
             if (tid == 0)
             {
-                ((T*)optval.ptr(blockIdx.y))[blockIdx.x] = (T)soptval[0];
-                ((unsigned int*)optloc.ptr(blockIdx.y))[blockIdx.x] = soptloc[0];
+                minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
+                maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
+                minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0];
+                maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0];
+            }
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
+
+            // Process partial results in the first thread of the last block
+            if ((gridDim.x > 1 || gridDim.y > 1) && tid == 0)
+            {
+                __threadfence();
+                if (atomicInc(&blocks_finished, gridDim.x * gridDim.y) == gridDim.x * gridDim.y - 1)
+                {
+                    mymin = minval[0];
+                    mymax = maxval[0];
+                    unsigned int imin = 0, imax = 0;
+                    for (unsigned int i = 1; i < gridDim.x * gridDim.y; ++i)
+                    {
+                        val = minval[i]; if (val < mymin) { mymin = val; imin = i; }
+                        val = maxval[i]; if (val > mymax) { mymax = val; imax = i; }
+                    }
+                    minval[0] = mymin;
+                    maxval[0] = mymax;
+                    minloc[0] = minloc[imin];
+                    maxloc[0] = maxloc[imax];
+                }
             }
+#endif
         }
 
-        template <int nthreads, int op, typename T>
-        __global__ void opt_loc_kernel(int cols, int rows, const PtrStep src, const PtrStep loc, PtrStep optval, PtrStep optloc)
+
+        template <typename T>
+        void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval,
+                                int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
         {
-            typedef typename MinMaxTypeTraits<T>::best_type best_type;
-            __shared__ best_type soptval[nthreads];
-            __shared__ unsigned int soptloc[nthreads];
+            dim3 threads, grid;
+            estimate_thread_cfg(threads, grid);
+            estimate_kernel_consts(src.cols, src.rows, threads, grid);
 
-            unsigned int x0 = blockIdx.x * blockDim.x;
-            unsigned int y0 = blockIdx.y * blockDim.y;
-            unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
+            T* minval_buf = (T*)valbuf.ptr(0);
+            T* maxval_buf = (T*)valbuf.ptr(1);
+            unsigned int* minloc_buf = (unsigned int*)locbuf.ptr(0);
+            unsigned int* maxloc_buf = (unsigned int*)locbuf.ptr(1);
 
-            if (x0 + threadIdx.x < cols && y0 + threadIdx.y < rows)
-            {
-                soptval[tid] = ((const T*)src.ptr(y0 + threadIdx.y))[x0 + threadIdx.x];
-                soptloc[tid] = ((const unsigned int*)loc.ptr(y0 + threadIdx.y))[x0 + threadIdx.x];
-            }
-            else
-            {
-                soptval[tid] = ((const T*)src.ptr(y0))[x0];
-                soptloc[tid] = ((const unsigned int*)loc.ptr(y0))[x0];
-            }
+            cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
+            min_max_loc_kernel<256, T><<<grid, threads>>>(src, minval_buf, maxval_buf, minloc_buf, maxloc_buf);
+            cudaSafeCall(cudaThreadSynchronize());
 
-            __syncthreads();
+            T minval_, maxval_;
+            cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
+            cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
+            *minval = minval_;
+            *maxval = maxval_;
 
-            if (nthreads >= 512) if (tid < 256) { OptLoc<best_type, op>::call(tid, 256, soptval, soptloc); __syncthreads(); }
-            if (nthreads >= 256) if (tid < 128) { OptLoc<best_type, op>::call(tid, 128, soptval, soptloc); __syncthreads(); }
-            if (nthreads >= 128) if (tid < 64) { OptLoc<best_type, op>::call(tid, 64, soptval, soptloc); __syncthreads(); }
+            unsigned int minloc_, maxloc_;
+            cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
+            cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
+            minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
+            maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
+        }
 
-            if (tid < 32)
-            {
-                if (nthreads >= 64) OptLoc<best_type, op>::call(tid, 32, soptval, soptloc);
-                if (nthreads >= 32) OptLoc<best_type, op>::call(tid, 16, soptval, soptloc);
-                if (nthreads >= 16) OptLoc<best_type, op>::call(tid, 8, soptval, soptloc);
-                if (nthreads >= 8) OptLoc<best_type, op>::call(tid, 4, soptval, soptloc);
-                if (nthreads >= 4) OptLoc<best_type, op>::call(tid, 2, soptval, soptloc);
-                if (nthreads >= 2) OptLoc<best_type, op>::call(tid, 1, soptval, soptloc);
-            }
+        template void min_max_loc_caller<unsigned char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+        template void min_max_loc_caller<signed char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+        template void min_max_loc_caller<unsigned short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+        template void min_max_loc_caller<signed short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+        template void min_max_loc_caller<int>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+        template void min_max_loc_caller<float>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+        template void min_max_loc_caller<double>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
 
-            if (tid == 0)
-            {
-                ((T*)optval.ptr(blockIdx.y))[blockIdx.x] = (T)soptval[0];
-                ((unsigned int*)optloc.ptr(blockIdx.y))[blockIdx.x] = soptloc[0];
+
+        // This kernel will be used only when compute capability is 1.0
+        template <typename T>
+        __global__ void min_max_loc_kernel_2ndstep(T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, int size)
+        {
+            T val;
+            T mymin = minval[0];
+            T mymax = maxval[0];
+            unsigned int imin = 0, imax = 0;
+            for (unsigned int i = 1; i < size; ++i)
+            {
+                val = minval[i]; if (val < mymin) { mymin = val; imin = i; }
+                val = maxval[i]; if (val > mymax) { mymax = val; imax = i; }
             }
+            minval[0] = mymin;
+            maxval[0] = mymax;
+            minloc[0] = minloc[imin];
+            maxloc[0] = maxloc[imax];
         }
 
+
         template <typename T>
-        void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval, int* minlocx, int* minlocy,
-                                int* maxlocx, int* maxlocy)
-        {
-            dim3 threads(32, 8);
-
-            // Allocate memory for aux. buffers
-
-            DevMem2D minval_buf[2];
-            minval_buf[0].cols = divUp(src.cols, threads.x);
-            minval_buf[0].rows = divUp(src.rows, threads.y);
-            minval_buf[1].cols = divUp(minval_buf[0].cols, threads.x);
-            minval_buf[1].rows = divUp(minval_buf[0].rows, threads.y);
-            cudaSafeCall(cudaMallocPitch(&minval_buf[0].data, &minval_buf[0].step, minval_buf[0].cols * sizeof(T), minval_buf[0].rows));
-            cudaSafeCall(cudaMallocPitch(&minval_buf[1].data, &minval_buf[1].step, minval_buf[1].cols * sizeof(T), minval_buf[1].rows));
-
-            DevMem2D maxval_buf[2];
-            maxval_buf[0].cols = divUp(src.cols, threads.x);
-            maxval_buf[0].rows = divUp(src.rows, threads.y);
-            maxval_buf[1].cols = divUp(maxval_buf[0].cols, threads.x);
-            maxval_buf[1].rows = divUp(maxval_buf[0].rows, threads.y);
-            cudaSafeCall(cudaMallocPitch(&maxval_buf[0].data, &maxval_buf[0].step, maxval_buf[0].cols * sizeof(T), maxval_buf[0].rows));
-            cudaSafeCall(cudaMallocPitch(&maxval_buf[1].data, &maxval_buf[1].step, maxval_buf[1].cols * sizeof(T), maxval_buf[1].rows));
-
-            DevMem2D minloc_buf[2];
-            minloc_buf[0].cols = divUp(src.cols, threads.x);
-            minloc_buf[0].rows = divUp(src.rows, threads.y);
-            minloc_buf[1].cols = divUp(minloc_buf[0].cols, threads.x);
-            minloc_buf[1].rows = divUp(minloc_buf[0].rows, threads.y);
-            cudaSafeCall(cudaMallocPitch(&minloc_buf[0].data, &minloc_buf[0].step, minloc_buf[0].cols * sizeof(int), minloc_buf[0].rows));
-            cudaSafeCall(cudaMallocPitch(&minloc_buf[1].data, &minloc_buf[1].step, minloc_buf[1].cols * sizeof(int), minloc_buf[1].rows));
-
-            DevMem2D maxloc_buf[2];
-            maxloc_buf[0].cols = divUp(src.cols, threads.x);
-            maxloc_buf[0].rows = divUp(src.rows, threads.y);
-            maxloc_buf[1].cols = divUp(maxloc_buf[0].cols, threads.x);
-            maxloc_buf[1].rows = divUp(maxloc_buf[0].rows, threads.y);
-            cudaSafeCall(cudaMallocPitch(&maxloc_buf[0].data, &maxloc_buf[0].step, maxloc_buf[0].cols * sizeof(int), maxloc_buf[0].rows));
-            cudaSafeCall(cudaMallocPitch(&maxloc_buf[1].data, &maxloc_buf[1].step, maxloc_buf[1].cols * sizeof(int), maxloc_buf[1].rows));
-
-            int curbuf = 0;
-            dim3 cursize(src.cols, src.rows);
-            dim3 grid(divUp(cursize.x, threads.x), divUp(cursize.y, threads.y));
-
-            opt_loc_init_kernel<256, OP_MIN, T><<<grid, threads>>>(cursize.x, cursize.y, src, minval_buf[curbuf], minloc_buf[curbuf]);
-            opt_loc_init_kernel<256, OP_MAX, T><<<grid, threads>>>(cursize.x, cursize.y, src, maxval_buf[curbuf], maxloc_buf[curbuf]);
-            cursize = grid;
-
-            while (cursize.x > 1 || cursize.y > 1)
-            {
-                grid.x = divUp(cursize.x, threads.x);
-                grid.y = divUp(cursize.y, threads.y);
-                opt_loc_kernel<256, OP_MIN, T><<<grid, threads>>>(cursize.x, cursize.y, minval_buf[curbuf], minloc_buf[curbuf],
-                                                                  minval_buf[1 - curbuf], minloc_buf[1 - curbuf]);
-                opt_loc_kernel<256, OP_MAX, T><<<grid, threads>>>(cursize.x, cursize.y, maxval_buf[curbuf], maxloc_buf[curbuf],
-                                                                  maxval_buf[1 - curbuf], maxloc_buf[1 - curbuf]);
-                curbuf = 1 - curbuf;
-                cursize = grid;
-            }
+        void min_max_loc_caller_2steps(const DevMem2D src, double* minval, double* maxval,
+                                       int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
+        {
+            dim3 threads, grid;
+            estimate_thread_cfg(threads, grid);
+            estimate_kernel_consts(src.cols, src.rows, threads, grid);
 
-            cudaSafeCall(cudaThreadSynchronize());
+            T* minval_buf = (T*)valbuf.ptr(0);
+            T* maxval_buf = (T*)valbuf.ptr(1);
+            unsigned int* minloc_buf = (unsigned int*)locbuf.ptr(0);
+            unsigned int* maxloc_buf = (unsigned int*)locbuf.ptr(1);
 
-            // Copy results from device to host
+            cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
+            min_max_loc_kernel<256, T><<<grid, threads>>>(src, minval_buf, maxval_buf, minloc_buf, maxloc_buf);
+            min_max_loc_kernel_2ndstep<<<1, 1>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
+            cudaSafeCall(cudaThreadSynchronize());
 
             T minval_, maxval_;
-            cudaSafeCall(cudaMemcpy(&minval_, minval_buf[curbuf].ptr(0), sizeof(T), cudaMemcpyDeviceToHost));
-            cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf[curbuf].ptr(0), sizeof(T), cudaMemcpyDeviceToHost));
+            cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
+            cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
             *minval = minval_;
             *maxval = maxval_;
 
-            unsigned int minloc, maxloc;
-            cudaSafeCall(cudaMemcpy(&minloc, minloc_buf[curbuf].ptr(0), sizeof(int), cudaMemcpyDeviceToHost));
-            cudaSafeCall(cudaMemcpy(&maxloc, maxloc_buf[curbuf].ptr(0), sizeof(int), cudaMemcpyDeviceToHost));
-            *minlocy = minloc / src.cols; *minlocx = minloc - *minlocy * src.cols;
-            *maxlocy = maxloc / src.cols; *maxlocx = maxloc - *maxlocy * src.cols;
-
-            // Release aux. buffers
-            cudaSafeCall(cudaFree(minval_buf[0].data));
-            cudaSafeCall(cudaFree(minval_buf[1].data));
-            cudaSafeCall(cudaFree(maxval_buf[0].data));
-            cudaSafeCall(cudaFree(maxval_buf[1].data));
-            cudaSafeCall(cudaFree(minloc_buf[0].data));
-            cudaSafeCall(cudaFree(minloc_buf[1].data));
-            cudaSafeCall(cudaFree(maxloc_buf[0].data));
-            cudaSafeCall(cudaFree(maxloc_buf[1].data));
-        }
-
-        template void min_max_loc_caller<unsigned char>(const DevMem2D, double*, double*, int*, int*, int*, int*);
-        template void min_max_loc_caller<signed char>(const DevMem2D, double*, double*, int*, int*, int*, int*);
-        template void min_max_loc_caller<unsigned short>(const DevMem2D, double*, double*, int*, int*, int*, int*);
-        template void min_max_loc_caller<signed short>(const DevMem2D, double*, double*, int*, int*, int*, int*);
-        template void min_max_loc_caller<int>(const DevMem2D, double*, double*, int*, int*, int*, int*);
-        template void min_max_loc_caller<float>(const DevMem2D, double*, double*, int*, int*, int*, int*);
-        template void min_max_loc_caller<double>(const DevMem2D, double*, double*, int*, int*, int*, int*);
+            unsigned int minloc_, maxloc_;
+            cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
+            cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
+            minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
+            maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
+        }
+
+        template void min_max_loc_caller_2steps<unsigned char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+        template void min_max_loc_caller_2steps<signed char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+        template void min_max_loc_caller_2steps<unsigned short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+        template void min_max_loc_caller_2steps<signed short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+        template void min_max_loc_caller_2steps<int>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
+        template void min_max_loc_caller_2steps<float>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
 
     } // namespace minmaxloc
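The single-pass kernel above relies on a "last block finishes the reduction" idiom: each block writes its partial result, issues __threadfence() so the write is visible device-wide, then takes a ticket with atomicInc; the block that draws the last ticket merges all partials in place. The dependence on global atomics is what restricts that path to compute capability >= 1.1 and motivates the two-kernel fallback. A stripped-down, float-only sketch of the idiom (standalone illustration, not part of the patch; names like reduce_min, blocks_done, and partial are hypothetical, and blocks_done must be zeroed from the host before each launch, as the patch does for blocks_finished):

    #include <cfloat>

    __device__ unsigned int blocks_done = 0;

    // Launch as reduce_min<<<grid, 256>>>(data, n, partial),
    // where partial has one slot per block.
    __global__ void reduce_min(const float* data, int n, float* partial)
    {
        __shared__ float smin[256];

        // Each thread folds a grid-stride slice of the input into a private minimum.
        float m = FLT_MAX;
        for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
            m = fminf(m, data[i]);
        smin[threadIdx.x] = m;
        __syncthreads();

        // Shared-memory tree reduction within the block.
        for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
        {
            if (threadIdx.x < s)
                smin[threadIdx.x] = fminf(smin[threadIdx.x], smin[threadIdx.x + s]);
            __syncthreads();
        }

        if (threadIdx.x == 0)
        {
            partial[blockIdx.x] = smin[0];
            __threadfence(); // publish the partial result before taking a ticket
            // The block that takes the last ticket merges all partial results.
            if (atomicInc(&blocks_done, gridDim.x) == gridDim.x - 1)
            {
                float r = partial[0];
                for (unsigned int i = 1; i < gridDim.x; ++i)
                    r = fminf(r, partial[i]);
                partial[0] = r;
            }
        }
    }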
diff --git a/tests/gpu/src/arithm.cpp b/tests/gpu/src/arithm.cpp
index 600203b..6f712ac 100644
--- a/tests/gpu/src/arithm.cpp
+++ b/tests/gpu/src/arithm.cpp
@@ -683,7 +683,7 @@ struct CV_GpuMinMaxTest: public CvTest
         int depth_end;
         int major, minor;
         cv::gpu::getComputeCapability(getDevice(), major, minor);
-        minor = 0;
+
         if (minor >= 1) depth_end = CV_64F; else depth_end = CV_32F;
         for (int cn = 1; cn <= 4; ++cn)
@@ -757,11 +757,14 @@ struct CV_GpuMinMaxLocTest: public CvTest
 {
     CV_GpuMinMaxLocTest(): CvTest("GPU-MinMaxLocTest", "minMaxLoc") {}
 
+    GpuMat valbuf, locbuf;
+
     void run(int)
     {
         int depth_end;
         int major, minor;
         cv::gpu::getComputeCapability(getDevice(), major, minor);
+
         if (minor >= 1) depth_end = CV_64F; else depth_end = CV_32F;
         for (int depth = CV_8U; depth <= depth_end; ++depth)
         {
@@ -807,7 +810,7 @@ struct CV_GpuMinMaxLocTest: public CvTest
         double minVal_, maxVal_;
         cv::Point minLoc_, maxLoc_;
 
-        cv::gpu::minMaxLoc(cv::gpu::GpuMat(src), &minVal_, &maxVal_, &minLoc_, &maxLoc_);
+        cv::gpu::minMaxLoc(cv::gpu::GpuMat(src), &minVal_, &maxVal_, &minLoc_, &maxLoc_, valbuf, locbuf);
 
         CHECK(minVal == minVal_, CvTS::FAIL_INVALID_OUTPUT);
         CHECK(maxVal == maxVal_, CvTS::FAIL_INVALID_OUTPUT);
diff --git a/tests/gpu/src/gputest_main.cpp b/tests/gpu/src/gputest_main.cpp
index 4bce0e1..01c11c0 100644
--- a/tests/gpu/src/gputest_main.cpp
+++ b/tests/gpu/src/gputest_main.cpp
@@ -54,6 +54,7 @@ const char* blacklist[] =
 };
 
 int main( int argc, char** argv )
+
 {
     return test_system.run( argc, argv, blacklist );
 }
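As in the updated test, the buffered overload can be validated against the CPU path while reusing one pair of scratch buffers. A minimal sketch along those lines (hypothetical helper, not part of the patch; when the extremum is not unique the returned locations may legitimately differ, so only the values are compared here):

    bool check_min_max_loc(const cv::Mat& src, cv::gpu::GpuMat& valbuf, cv::gpu::GpuMat& locbuf)
    {
        double minVal, maxVal, minValG, maxValG;
        cv::Point minLoc, maxLoc, minLocG, maxLocG;
        cv::minMaxLoc(src, &minVal, &maxVal, &minLoc, &maxLoc);  // CPU reference
        cv::gpu::minMaxLoc(cv::gpu::GpuMat(src), &minValG, &maxValG,
                           &minLocG, &maxLocG, valbuf, locbuf);  // GPU, buffered
        return minVal == minValG && maxVal == maxValG;
    }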