From 1d2cf0e20eb366f37662e5e8f694e35de60a5f4c Mon Sep 17 00:00:00 2001 From: Alexander Karsakov Date: Tue, 22 Jul 2014 14:54:38 +0400 Subject: [PATCH] Added nonzero_rows support --- modules/core/src/dxt.cpp | 49 +++++++-------- modules/core/src/ocl.cpp | 8 ++- modules/core/src/opencl/fft.cl | 120 +++++++++++++++++++++---------------- modules/core/test/ocl/test_dft.cpp | 32 +++++----- 4 files changed, 109 insertions(+), 100 deletions(-) diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp index eaef53a..869409f 100644 --- a/modules/core/src/dxt.cpp +++ b/modules/core/src/dxt.cpp @@ -2034,19 +2034,19 @@ enum FftType C2C = 3 }; -static std::vector ocl_getRadixes(int cols, std::vector& radixes, std::vector& blocks, int& min_radix) +static void ocl_getRadixes(int cols, std::vector& radixes, std::vector& blocks, int& min_radix) { int factors[34]; - int nf = DFTFactorize( cols, factors ); + int nf = DFTFactorize(cols, factors); int n = 1; int factor_index = 0; min_radix = INT_MAX; // 2^n transforms - if ( (factors[factor_index] & 1) == 0 ) + if ((factors[factor_index] & 1) == 0) { - for( ; n < factors[factor_index]; ) + for( ; n < factors[factor_index];) { int radix = 2, block = 1; if (8*n <= factors[0]) @@ -2080,7 +2080,7 @@ static std::vector ocl_getRadixes(int cols, std::vector& radixes, std: } // all the other transforms - for( ; factor_index < nf; factor_index++ ) + for( ; factor_index < nf; factor_index++) { int radix = factors[factor_index], block = 1; if (radix == 3) @@ -2101,7 +2101,6 @@ static std::vector ocl_getRadixes(int cols, std::vector& radixes, std: blocks.push_back(block); min_radix = min(min_radix, block*radix); } - return radixes; } struct OCL_FftPlan @@ -2111,14 +2110,13 @@ struct OCL_FftPlan int thread_count; int dft_size; - int flags; bool status; - OCL_FftPlan(int _size, int _flags): dft_size(_size), flags(_flags), status(true) + OCL_FftPlan(int _size): dft_size(_size), status(true) { int min_radix; std::vector radixes, blocks; ocl_getRadixes(dft_size, radixes, blocks, min_radix); - thread_count = (dft_size + min_radix-1) / min_radix; + thread_count = dft_size / min_radix; if (thread_count > ocl::Device::getDefault().maxWorkGroupSize()) { @@ -2140,8 +2138,7 @@ struct OCL_FftPlan n *= radix; } - twiddles.create(1, twiddle_size, CV_32FC2); - Mat tw = twiddles.getMat(ACCESS_WRITE); + Mat tw(1, twiddle_size, CV_32FC2); float* ptr = tw.ptr(); int ptr_index = 0; @@ -2162,6 +2159,7 @@ struct OCL_FftPlan } } } + twiddles = tw.getUMat(ACCESS_READ); buildOptions = format("-D LOCAL_SIZE=%d -D kercn=%d -D RADIX_PROCESS=%s", dft_size, dft_size/thread_count, radix_processing.c_str()); @@ -2185,10 +2183,10 @@ struct OCL_FftPlan if (rows) { - globalsize[0] = thread_count; globalsize[1] = dft_size; + globalsize[0] = thread_count; globalsize[1] = src.rows; localsize[0] = thread_count; localsize[1] = 1; kernel_name = !inv ? "fft_multi_radix_rows" : "ifft_multi_radix_rows"; - if (is1d && (flags & DFT_SCALE)) + if ((is1d || inv) && (flags & DFT_SCALE)) options += " -D DFT_SCALE"; } else @@ -2200,14 +2198,9 @@ struct OCL_FftPlan options += " -D DFT_SCALE"; } - if (src.channels() == 1) - options += " -D REAL_INPUT"; - else - options += " -D COMPLEX_INPUT"; - if (dst.channels() == 1) - options += " -D REAL_OUTPUT"; - if (is1d) - options += " -D IS_1D"; + options += src.channels() == 1 ? " -D REAL_INPUT" : " -D COMPLEX_INPUT"; + options += dst.channels() == 1 ? " -D REAL_OUTPUT" : " -D COMPLEX_OUTPUT"; + options += is1d ? " -D IS_1D" : ""; if (!inv) { @@ -2216,10 +2209,10 @@ struct OCL_FftPlan } else { - if (is1d && fftType == C2R || (rows && fftType == R2R)) + if (rows && (fftType == C2R || fftType == R2R)) options += " -D NO_CONJUGATE"; if (dst.cols % 2 == 0) - options += " -D EVEN"; + options += " -D EVEN"; } ocl::Kernel k(kernel_name.c_str(), ocl::core::fft_oclsrc, options); @@ -2240,7 +2233,7 @@ public: return planCache; } - OCL_FftPlan* getFftPlan(int dft_size, int flags) + OCL_FftPlan* getFftPlan(int dft_size) { for (size_t i = 0, size = planStorage.size(); i < size; ++i) { @@ -2252,7 +2245,7 @@ public: } } - OCL_FftPlan * newPlan = new OCL_FftPlan(dft_size, flags); + OCL_FftPlan * newPlan = new OCL_FftPlan(dft_size); planStorage.push_back(newPlan); return newPlan; } @@ -2275,13 +2268,13 @@ protected: static bool ocl_dft_C2C_rows(InputArray _src, OutputArray _dst, int nonzero_rows, int flags, int fftType) { - const OCL_FftPlan* plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.cols(), flags); + const OCL_FftPlan* plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.cols()); return plan->enqueueTransform(_src, _dst, nonzero_rows, flags, fftType, true); } static bool ocl_dft_C2C_cols(InputArray _src, OutputArray _dst, int nonzero_cols, int flags, int fftType) { - const OCL_FftPlan* plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.rows(), flags); + const OCL_FftPlan* plan = OCL_FftPlanCache::getInstance().getFftPlan(_src.rows()); return plan->enqueueTransform(_src, _dst, nonzero_cols, flags, fftType, false); } @@ -2385,7 +2378,7 @@ static bool ocl_dft(InputArray _src, OutputArray _dst, int flags, int nonzero_ro } else { - int nonzero_cols = src.cols/2 + 1;// : src.cols; + int nonzero_cols = src.cols/2 + 1; if (!ocl_dft_C2C_cols(src, output, nonzero_cols, flags, fftType)) return false; diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp index 32db8c9..a2110f6 100644 --- a/modules/core/src/ocl.cpp +++ b/modules/core/src/ocl.cpp @@ -3002,7 +3002,8 @@ bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[], sync ? 0 : &p->e); if( sync || retval != CL_SUCCESS ) { - CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS); + int a = clFinish(qq); + CV_OclDbgAssert(a == CL_SUCCESS); p->cleanupUMats(); } else @@ -3898,8 +3899,9 @@ public: if( (accessFlags & ACCESS_READ) != 0 && u->hostCopyObsolete() ) { AlignedDataPtr alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT); - CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0, - u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS ); + int a = clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0, + u->size, alignedPtr.getAlignedPtr(), 0, 0, 0); + CV_Assert( a == CL_SUCCESS ); u->markHostCopyObsolete(false); } } diff --git a/modules/core/src/opencl/fft.cl b/modules/core/src/opencl/fft.cl index dd8ff59..b8d2c67 100644 --- a/modules/core/src/opencl/fft.cl +++ b/modules/core/src/opencl/fft.cl @@ -16,7 +16,7 @@ float2 twiddle(float2 a) { } __attribute__((always_inline)) -void butterfly2(float2 a0, float2 a1, __local float2* smem, __constant const float2* twiddles, +void butterfly2(float2 a0, float2 a1, __local float2* smem, __global const float2* twiddles, const int x, const int block_size) { const int k = x & (block_size - 1); @@ -28,7 +28,7 @@ void butterfly2(float2 a0, float2 a1, __local float2* smem, __constant const flo } __attribute__((always_inline)) -void butterfly4(float2 a0, float2 a1, float2 a2, float2 a3, __local float2* smem, __constant const float2* twiddles, +void butterfly4(float2 a0, float2 a1, float2 a2, float2 a3, __local float2* smem, __global const float2* twiddles, const int x, const int block_size) { const int k = x & (block_size - 1); @@ -50,10 +50,10 @@ void butterfly4(float2 a0, float2 a1, float2 a2, float2 a3, __local float2* smem } __attribute__((always_inline)) -void butterfly3(float2 a0, float2 a1, float2 a2, __local float2* smem, __constant const float2* twiddles, +void butterfly3(float2 a0, float2 a1, float2 a2, __local float2* smem, __global const float2* twiddles, const int x, const int block_size) { - const int k = x & (block_size - 1); + const int k = x % block_size; a1 = mul_float2(twiddles[k], a1); a2 = mul_float2(twiddles[k+block_size], a2); const int dst_ind = ((x - k) * 3) + k; @@ -68,10 +68,10 @@ void butterfly3(float2 a0, float2 a1, float2 a2, __local float2* smem, __constan } __attribute__((always_inline)) -void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local float2* smem, __constant const float2* twiddles, +void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local float2* smem, __global const float2* twiddles, const int x, const int block_size) { - const int k = x & (block_size - 1); + const int k = x % block_size; a1 = mul_float2(twiddles[k], a1); a2 = mul_float2(twiddles[k + block_size], a2); a3 = mul_float2(twiddles[k+2*block_size], a3); @@ -109,7 +109,7 @@ void butterfly5(float2 a0, float2 a1, float2 a2, float2 a3, float2 a4, __local f } __attribute__((always_inline)) -void fft_radix2(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) +void fft_radix2(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) { float2 a0, a1; @@ -128,7 +128,7 @@ void fft_radix2(__local float2* smem, __constant const float2* twiddles, const i } __attribute__((always_inline)) -void fft_radix2_B2(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix2_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int x2 = x1 + t/2; float2 a0, a1, a2, a3; @@ -151,7 +151,7 @@ void fft_radix2_B2(__local float2* smem, __constant const float2* twiddles, cons } __attribute__((always_inline)) -void fft_radix2_B3(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix2_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int x2 = x1 + t/3; const int x3 = x1 + 2*t/3; @@ -177,7 +177,7 @@ void fft_radix2_B3(__local float2* smem, __constant const float2* twiddles, cons } __attribute__((always_inline)) -void fft_radix2_B4(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix2_B4(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int thread_block = t/4; const int x2 = x1 + thread_block; @@ -207,7 +207,7 @@ void fft_radix2_B4(__local float2* smem, __constant const float2* twiddles, cons } __attribute__((always_inline)) -void fft_radix2_B5(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix2_B5(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int thread_block = t/5; const int x2 = x1 + thread_block; @@ -240,7 +240,7 @@ void fft_radix2_B5(__local float2* smem, __constant const float2* twiddles, cons } __attribute__((always_inline)) -void fft_radix4(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) +void fft_radix4(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) { float2 a0, a1, a2, a3; @@ -258,7 +258,7 @@ void fft_radix4(__local float2* smem, __constant const float2* twiddles, const i } __attribute__((always_inline)) -void fft_radix4_B2(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix4_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int x2 = x1 + t/2; float2 a0, a1, a2, a3, a4, a5, a6, a7; @@ -281,7 +281,7 @@ void fft_radix4_B2(__local float2* smem, __constant const float2* twiddles, cons } __attribute__((always_inline)) -void fft_radix4_B3(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix4_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int x2 = x1 + t/3; const int x3 = x2 + t/3; @@ -307,7 +307,7 @@ void fft_radix4_B3(__local float2* smem, __constant const float2* twiddles, cons } __attribute__((always_inline)) -void fft_radix8(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) +void fft_radix8(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) { const int k = x % block_size; float2 a0, a1, a2, a3, a4, a5, a6, a7; @@ -370,7 +370,7 @@ void fft_radix8(__local float2* smem, __constant const float2* twiddles, const i } __attribute__((always_inline)) -void fft_radix3(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) +void fft_radix3(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) { float2 a0, a1, a2; @@ -388,7 +388,7 @@ void fft_radix3(__local float2* smem, __constant const float2* twiddles, const i } __attribute__((always_inline)) -void fft_radix3_B2(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix3_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int x2 = x1 + t/2; float2 a0, a1, a2, a3, a4, a5; @@ -411,7 +411,7 @@ void fft_radix3_B2(__local float2* smem, __constant const float2* twiddles, cons } __attribute__((always_inline)) -void fft_radix3_B3(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix3_B3(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int x2 = x1 + t/3; const int x3 = x2 + t/3; @@ -437,7 +437,7 @@ void fft_radix3_B3(__local float2* smem, __constant const float2* twiddles, cons } __attribute__((always_inline)) -void fft_radix3_B4(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix3_B4(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int thread_block = t/4; const int x2 = x1 + thread_block; @@ -467,7 +467,7 @@ void fft_radix3_B4(__local float2* smem, __constant const float2* twiddles, cons } __attribute__((always_inline)) -void fft_radix5(__local float2* smem, __constant const float2* twiddles, const int x, const int block_size, const int t) +void fft_radix5(__local float2* smem, __global const float2* twiddles, const int x, const int block_size, const int t) { const int k = x % block_size; float2 a0, a1, a2, a3, a4; @@ -486,7 +486,7 @@ void fft_radix5(__local float2* smem, __constant const float2* twiddles, const i } __attribute__((always_inline)) -void fft_radix5_B2(__local float2* smem, __constant const float2* twiddles, const int x1, const int block_size, const int t) +void fft_radix5_B2(__local float2* smem, __global const float2* twiddles, const int x1, const int block_size, const int t) { const int x2 = x1+t/2; float2 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9; @@ -516,24 +516,23 @@ void fft_radix5_B2(__local float2* smem, __constant const float2* twiddles, cons __kernel void fft_multi_radix_rows(__global const uchar* src_ptr, int src_step, int src_offset, int src_rows, int src_cols, __global uchar* dst_ptr, int dst_step, int dst_offset, int dst_rows, int dst_cols, - __constant float2 * twiddles_ptr, const int t, const int nz) + __global float2* twiddles_ptr, const int t, const int nz) { const int x = get_global_id(0); const int y = get_group_id(1); - + const int block_size = LOCAL_SIZE/kercn; if (y < nz) { __local float2 smem[LOCAL_SIZE]; - __constant const float2* twiddles = (__constant float2*) twiddles_ptr; + __global const float2* twiddles = (__global float2*) twiddles_ptr; const int ind = x; - const int block_size = LOCAL_SIZE/kercn; #ifdef IS_1D float scale = 1.f/dst_cols; #else float scale = 1.f/(dst_cols*dst_rows); #endif -#ifndef REAL_INPUT +#ifdef COMPLEX_INPUT __global const float2* src = (__global const float2*)(src_ptr + mad24(y, src_step, mad24(x, (int)(sizeof(float)*2), src_offset))); #pragma unroll for (int i=0; i(df) << std::endl; double eps = src.size().area() * 1e-4; @@ -188,13 +185,12 @@ OCL_TEST_P(MulSpectrums, Mat) OCL_INSTANTIATE_TEST_CASE_P(OCL_ImgProc, MulSpectrums, testing::Combine(Bool(), Bool())); -OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(4, 1), cv::Size(5, 8), cv::Size(6, 6), - cv::Size(512, 1), cv::Size(1280, 768)), - Values((OCL_FFT_TYPE) R2C, (OCL_FFT_TYPE) C2C, (OCL_FFT_TYPE) R2R, (OCL_FFT_TYPE) C2R), +OCL_INSTANTIATE_TEST_CASE_P(Core, Dft, Combine(Values(cv::Size(10, 10), cv::Size(36, 36), cv::Size(512, 1), cv::Size(1280, 768)), + Values((OCL_FFT_TYPE) R2C, (OCL_FFT_TYPE) C2C, (OCL_FFT_TYPE) R2R, (OCL_FFT_TYPE) C2R), Bool(), // DFT_INVERSE Bool(), // DFT_ROWS Bool(), // DFT_SCALE - Bool() // inplace + Bool() // hint ) ); -- 2.7.4