return k.run(2, globalsize, localsize, false);
}
+const int shift_bits = 8;
+
static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX, int anchor,
- int borderType, int ddepth, bool fast8uc1)
+ int borderType, int ddepth, bool fast8uc1, bool int_arithm)
{
int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type);
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
char cvt[40];
cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s"
- " -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s%s",
+ " -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s%s%s",
radiusX, (int)localsize[0], (int)localsize[1], cn, btype,
extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
ocl::typeToStr(type), ocl::typeToStr(buf_type),
ocl::convertTypeStr(sdepth, bdepth, cn, cvt),
ocl::typeToStr(sdepth), ocl::typeToStr(bdepth),
- doubleSupport ? " -D DOUBLE_SUPPORT" : "");
+ doubleSupport ? " -D DOUBLE_SUPPORT" : "",
+ int_arithm ? " -D INTEGER_ARITHMETIC" : "");
build_options += ocl::kernelToStr(kernelX, bdepth);
Size srcWholeSize; Point srcOffset;
return k.run(2, globalsize, localsize, false);
}
-static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor, int bits)
+static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor, bool int_arithm)
{
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
if (dst.depth() == CV_64F && !doubleSupport)
char cvt[40];
cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d"
" -D srcT=%s -D dstT=%s -D convertToDstT=%s"
- " -D srcT1=%s -D dstT1=%s -D BITS=%d%s",
+ " -D srcT1=%s -D dstT1=%s -D SHIFT_BITS=%d%s%s",
anchor, (int)localsize[0], (int)localsize[1], cn,
ocl::typeToStr(buf_type), ocl::typeToStr(dtype),
ocl::convertTypeStr(bdepth, ddepth, cn, cvt),
ocl::typeToStr(bdepth), ocl::typeToStr(ddepth),
- bits, doubleSupport ? " -D DOUBLE_SUPPORT" : "");
+ 2*shift_bits, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
+ int_arithm ? " -D INTEGER_ARITHMETIC" : "");
build_options += ocl::kernelToStr(kernelY, bdepth);
ocl::Kernel k("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc,
if (ddepth < 0)
ddepth = sdepth;
- //CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 &&
- // imgSize.width > optimizedSepFilterLocalSize + (kernelX.cols >> 1) &&
- // imgSize.height > optimizedSepFilterLocalSize + (kernelY.cols >> 1) &&
- // (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && anchor == Point(-1, -1) &&
- // (d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())),
- // ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta,
- // borderType & ~BORDER_ISOLATED, ddepth), true)
-
if (anchor.x < 0)
anchor.x = kernelX.cols >> 1;
if (anchor.y < 0)
anchor.y = kernelY.cols >> 1;
- UMat src = _src.getUMat();
- Size srcWholeSize; Point srcOffset;
- src.locateROI(srcWholeSize, srcOffset);
-
- //bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 &&
- // src.cols % 4 == 0 && src.step % 4 == 0;
- bool fast8uc1 = false;
-
int rtype = getKernelType(kernelX,
kernelX.rows == 1 ? Point(anchor.x, 0) : Point(0, anchor.x));
int ctype = getKernelType(kernelY,
kernelY.rows == 1 ? Point(anchor.y, 0) : Point(0, anchor.y));
int bdepth = CV_32F;
- int bits = 0;
-
+ bool int_arithm = false;
if( sdepth == CV_8U &&
((rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
ddepth == CV_8U)))
{
bdepth = CV_32S;
- bits = 8;
- _kernelX.getMat().convertTo( kernelX, CV_32S, 1 << bits );
- _kernelY.getMat().convertTo( kernelY, CV_32S, 1 << bits );
- kernelX = kernelX.reshape(1,1);
- kernelY = kernelY.reshape(1,1);
- bits *= 2;
- delta *= (1 << bits);
+ _kernelX.getMat().reshape(1,1).convertTo( kernelX, CV_32S, 1 << shift_bits );
+ _kernelY.getMat().reshape(1,1).convertTo( kernelY, CV_32S, 1 << shift_bits );
+ int_arithm = true;
}
+ CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 && !int_arithm &&
+ imgSize.width > optimizedSepFilterLocalSize + anchor.x &&
+ imgSize.height > optimizedSepFilterLocalSize + anchor.y &&
+ (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) &&
+ anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) &&
+ (d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())),
+ ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta,
+ borderType & ~BORDER_ISOLATED, ddepth), true)
+
+ UMat src = _src.getUMat();
+ Size srcWholeSize; Point srcOffset;
+ src.locateROI(srcWholeSize, srcOffset);
+
+ bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 &&
+ src.cols % 4 == 0 && src.step % 4 == 0 && !int_arithm;
+
Size srcSize = src.size();
Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1);
UMat buf(bufSize, CV_MAKETYPE(bdepth, cn));
- if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1))
+ if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, int_arithm))
return false;
- Mat buffer = buf.getMat(ACCESS_READ);
-
_dst.create(srcSize, CV_MAKETYPE(ddepth, cn));
UMat dst = _dst.getUMat();
- return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, bits);
+ return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, int_arithm);
}
#endif
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2014, Itseez, Inc, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
{
temp[0] = LDS_DAT[l_y + RADIUSY - i][l_x];
temp[1] = LDS_DAT[l_y + RADIUSY + i][l_x];
+#ifndef INTEGER_ARITHMETIC
sum += mad(temp[0], mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);
- //sum += temp[0]*mat_kernel[RADIUSY - i] + temp[1] * mat_kernel[RADIUSY + i];
+#else
+ sum += mad24(temp[0],mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);
+#endif
}
-#if BITS > 0
- sum = sum >> BITS;
+#ifdef INTEGER_ARITHMETIC
+ sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;
#endif
// write the result to dst
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2014, Itseez, Inc, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
{
temp[0] = LDS_DAT[l_y][l_x + RADIUSX - i];
temp[1] = LDS_DAT[l_y][l_x + RADIUSX + i];
+#ifndef INTEGER_ARITHMETIC
sum += mad(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]);
- //sum += convertToDstT(temp[0])*mat_kernel[RADIUSX - i] + convertToDstT(temp[1]) * mat_kernel[RADIUSX + i];
+#else
+ sum += mad24(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]);
+#endif
}
// write the result to dst
+++ /dev/null
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2014, Intel Corporation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-///////////////////////////////////////////////////////////////////////////////////////////////////
-/////////////////////////////////Macro for border type////////////////////////////////////////////
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-#ifdef BORDER_CONSTANT
-// CCCCCC|abcdefgh|CCCCCCC
-#define EXTRAPOLATE(x, maxV)
-#elif defined BORDER_REPLICATE
-// aaaaaa|abcdefgh|hhhhhhh
-#define EXTRAPOLATE(x, maxV) \
- { \
- (x) = max(min((x), (maxV) - 1), 0); \
- }
-#elif defined BORDER_WRAP
-// cdefgh|abcdefgh|abcdefg
-#define EXTRAPOLATE(x, maxV) \
- { \
- (x) = ( (x) + (maxV) ) % (maxV); \
- }
-#elif defined BORDER_REFLECT
-// fedcba|abcdefgh|hgfedcb
-#define EXTRAPOLATE(x, maxV) \
- { \
- (x) = min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ); \
- }
-#elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101
-// gfedcb|abcdefgh|gfedcba
-#define EXTRAPOLATE(x, maxV) \
- { \
- (x) = min(((maxV)-1)*2-(x), max((x),-(x)) ); \
- }
-#else
-#error No extrapolation method
-#endif
-
-#if CN != 3
-#define loadpix(addr) *(__global const srcT *)(addr)
-#define storepix(val, addr) *(__global dstT *)(addr) = val
-#define SRCSIZE (int)sizeof(srcT)
-#define DSTSIZE (int)sizeof(dstT)
-#else
-#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
-#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
-#define SRCSIZE (int)sizeof(srcT1)*3
-#define DSTSIZE (int)sizeof(dstT1)*3
-#endif
-
-#define SRC(_x,_y) convertToWT(loadpix(Src + mad24(_y, src_step, SRCSIZE * _x)))
-
-#ifdef BORDER_CONSTANT
-// CCCCCC|abcdefgh|CCCCCCC
-#define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? (const_v) : SRC((_x),(_y))
-#else
-#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y))
-#endif
-
-#define noconvert
-
-// horizontal and vertical filter kernels
-// should be defined on host during compile time to avoid overhead
-#define DIG(a) a,
-__constant int mat_kernelX[] = { KERNEL_MATRIX_X };
-__constant int mat_kernelY[] = { KERNEL_MATRIX_Y };
-
-__kernel void gaussian_blur_8u(__global uchar* Src, int src_step, int srcOffsetX, int srcOffsetY, int height, int width,
- __global uchar* Dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)
-{
- // RADIUSX, RADIUSY are filter dimensions
- // BLK_X, BLK_Y are local wrogroup sizes
- // all these should be defined on host during compile time
- // first lsmem array for source pixels used in first pass,
- // second lsmemDy for storing first pass results
- __local WT lsmem[BLK_Y + 2 * RADIUSY][BLK_X + 2 * RADIUSX];
- __local WT lsmemDy[BLK_Y][BLK_X + 2 * RADIUSX];
-
- // get local and global ids - used as image and local memory array indexes
- int lix = get_local_id(0);
- int liy = get_local_id(1);
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- // calculate pixel position in source image taking image offset into account
- int srcX = x + srcOffsetX - RADIUSX;
- int srcY = y + srcOffsetY - RADIUSY;
- int xb = srcX;
- int yb = srcY;
-
- // extrapolate coordinates, if needed
- // and read my own source pixel into local memory
- // with account for extra border pixels, which will be read by starting workitems
- int clocY = liy;
- int cSrcY = srcY;
- do
- {
- int yb = cSrcY;
- EXTRAPOLATE(yb, (height));
-
- int clocX = lix;
- int cSrcX = srcX;
- do
- {
- int xb = cSrcX;
- EXTRAPOLATE(xb,(width));
- lsmem[clocY][clocX] = ELEM(xb, yb, (width), (height), 0 );
-
- clocX += BLK_X;
- cSrcX += BLK_X;
- }
- while(clocX < BLK_X+(RADIUSX*2));
-
- clocY += BLK_Y;
- cSrcY += BLK_Y;
- }
- while (clocY < BLK_Y+(RADIUSY*2));
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // do vertical filter pass
- // and store intermediate results to second local memory array
- int i, clocX = lix;
- WT sum = 0;
- do
- {
- sum = 0;
- for (i=0; i<=2*RADIUSY; i++)
- sum = mad(lsmem[liy+i][clocX], mat_kernelY[i], sum);
- lsmemDy[liy][clocX] = sum;
- clocX += BLK_X;
- }
- while(clocX < BLK_X+(RADIUSX*2));
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // if this pixel happened to be out of image borders because of global size rounding,
- // then just return
- if( x >= dst_cols || y >=dst_rows )
- return;
-
- // do second horizontal filter pass
- // and calculate final result
- sum = 0;
- for (i=0; i<=2*RADIUSX; i++)
- sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
-
- sum = sum >> (GAUSSIAN_COEF_BITS * 2);
-
- //store result into destination image
- storepix(convertToDstT(sum), Dst + mad24(y, dst_step, mad24(x, DSTSIZE, dst_offset)));
-}
#include "precomp.hpp"
#include "opencl_kernels.hpp"
-#include <iostream>
/*
* This file includes the code, contributed by Simon Perreault
ky = getGaussianKernel( ksize.height, sigma2, std::max(depth, CV_32F) );
}
-#define GAUSSIAN_COEF_BITS 11
-
-static bool GaussianBlur_8u(InputArray _src, OutputArray _dst, Size ksize,
- double sigma1, double sigma2,
- int borderType)
-{
- int type = _src.type();
- Mat kx, ky;
- createGaussianKernels(kx, ky, CV_64F, ksize, sigma1, sigma2);
- Mat kx_8u, ky_8u;
-
- int scale_coef = 1 << GAUSSIAN_COEF_BITS;
- kx.convertTo(kx_8u, CV_32S, scale_coef);
- ky.convertTo(ky_8u, CV_32S, scale_coef);
-
- kx_8u.reshape(1, 1);
- ky_8u.reshape(1, 1);
-
- Size size = _src.size(), wholeSize;
- Point origin;
- int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
- esz = CV_ELEM_SIZE(stype), wdepth = CV_32S,
- ddepth = sdepth;
- size_t src_step = _src.step(), src_offset = _src.offset();
-
- if ((src_offset % src_step) % esz != 0 || !(borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE ||
- borderType == BORDER_REFLECT || borderType == BORDER_WRAP ||
- borderType == BORDER_REFLECT_101))
- return false;
-
- size_t lt2[2] = { 16, 16 };
- size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1] * (1 + (size.height - 1) / lt2[1]) };
-
- char cvt[2][40];
- const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
- "BORDER_REFLECT_101" };
-
- String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s"
- " -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s"
- " -D %s -D srcT1=%s -D dstT1=%s -D CN=%d -D GAUSSIAN_COEF_BITS=%d", (int)lt2[0], (int)lt2[1],
- kx.rows / 2, kx.rows / 2,
- ocl::kernelToStr(kx_8u, CV_32S, "KERNEL_MATRIX_X").c_str(),
- ocl::kernelToStr(ky_8u, CV_32S, "KERNEL_MATRIX_Y").c_str(),
- ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
- ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(stype),
- ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType],
- ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), cn, GAUSSIAN_COEF_BITS);
-
- ocl::Kernel k("gaussian_blur_8u", ocl::imgproc::gaussian_blur_8u_oclsrc, opts);
- if (k.empty())
- return false;
-
- UMat src = _src.getUMat();
- _dst.create(size, stype);
- UMat dst = _dst.getUMat();
-
- int src_offset_x = static_cast<int>((src_offset % src_step) / esz);
- int src_offset_y = static_cast<int>(src_offset / src_step);
-
- src.locateROI(wholeSize, origin);
-
- k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, src_offset_x, src_offset_y,
- wholeSize.height, wholeSize.width, ocl::KernelArg::WriteOnly(dst));
-
- return k.run(2, gt2, lt2, false);
-}
-
}
cv::Ptr<cv::FilterEngine> cv::createGaussianFilter( int type, Size ksize,
}
-
-
void cv::GaussianBlur( InputArray _src, OutputArray _dst, Size ksize,
double sigma1, double sigma2,
int borderType )
}
#endif
- //if (type == CV_8U)
- //{
- // CV_OCL_RUN_(_dst.isUMat() && _src.dims() <= 2 &&
- // (!(borderType & BORDER_ISOLATED) || _src.offset() == 0),
- // GaussianBlur_8u(_src, _dst, ksize, sigma1, sigma2, borderType))
- //}
-
Mat kx, ky;
createGaussianKernels(kx, ky, type, ksize, sigma1, sigma2);
sepFilter2D(_src, _dst, CV_MAT_DEPTH(type), kx, ky, Point(-1,-1), 0, borderType );
OCL_TEST_P(GaussianBlurTest, Mat)
{
- for (int j = 0; j < test_loop_times + 100; j++)
+ for (int j = 0; j < test_loop_times; j++)
{
random_roi();
OCL_OFF(cv::GaussianBlur(src_roi, dst_roi, Size(ksize, ksize), sigma1, sigma2, borderType));
OCL_ON(cv::GaussianBlur(usrc_roi, udst_roi, Size(ksize, ksize), sigma1, sigma2, borderType));
-
- if (checkNorm2(dst_roi, udst_roi) > 2 && CV_MAT_DEPTH(type) == CV_8U)
- {
- std::cout << "i = " << j << std::endl;
- Mat uudst = udst_roi.getMat(ACCESS_READ);
- Mat diff;
- absdiff(dst_roi, udst, diff);
- int nonZero = countNonZero(diff);
- double max;
- Point maxn;
- minMaxLoc(diff, (double*)0, &max, (Point*) 0, &maxn);
-
- uchar a = dst_roi.at<uchar>(maxn);
- uchar b = uudst.at<uchar>(maxn);
-
- std::cout << "dst_roi" << dst_roi << std::endl;
- std::cout << "udst_roi" << uudst << std::endl;
- }
-
-
-
- Near(CV_MAT_DEPTH(type) == CV_8U ? 1 : 5e-5, false);
+ Near(CV_MAT_DEPTH(type) >= CV_32F ? 5e-5 : 1, false);
}
}