rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL)
{
- bdepth = CV_32S;
- kernelX.convertTo( kernelX, bdepth, 1 << shift_bits );
- kernelY.convertTo( kernelY, bdepth, 1 << shift_bits );
+ // Scale the separable kernel coefficients by (1 << shift_bits) for the
+ // fixed-point path. Two representations are produced depending on the device:
+ //  - Intel: coefficients stay float but are rounded to whole numbers
+ //    (the kernels built with INTEL_DEVICE use mad() rather than mad24()).
+ //  - others: coefficients are converted to CV_32S.
+ if (ocl::Device::getDefault().isIntel())
+ {
+     for (int i = 0; i < kernelX.cols; i++)
+     {
+         kernelX.at<float>(0, i) = (float) cvRound(kernelX.at<float>(0, i) * (1 << shift_bits));
+     }
+     // Skip kernelY when it shares storage with kernelX, otherwise the same
+     // coefficients would be scaled twice.
+     if (kernelX.data != kernelY.data)
+     {
+         // Iterate over kernelY's own length: the two kernels may differ in size.
+         for (int i = 0; i < kernelY.cols; i++)
+         {
+             kernelY.at<float>(0, i) = (float) cvRound(kernelY.at<float>(0, i) * (1 << shift_bits));
+         }
+     }
+ }
+ else
+ {
+     bdepth = CV_32S;
+     kernelX.convertTo( kernelX, bdepth, 1 << shift_bits );
+     kernelY.convertTo( kernelY, bdepth, 1 << shift_bits );
+ }
int_arithm = true;
}
{
temp[0] = LDS_DAT[l_y + RADIUSY - i][l_x];
temp[1] = LDS_DAT[l_y + RADIUSY + i][l_x];
-#ifndef INTEGER_ARITHMETIC
- sum += mad(temp[0], mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);
-#else
+#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
sum += mad24(temp[0],mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);
+#else
+ sum += mad(temp[0], mat_kernel[RADIUSY - i], temp[1] * mat_kernel[RADIUSY + i]);
#endif
}
#ifdef INTEGER_ARITHMETIC
+#ifdef INTEL_DEVICE
+ sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS);
+#else
sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;
+#endif
#endif
// write the result to dst
#define DIG(a) a,
__constant dstT1 mat_kernel[] = { COEFF };
-#ifndef INTEGER_ARITHMETIC
-#define dstT4 float4
-#define convertDstVec convert_float4
-#else
+#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
#define dstT4 int4
#define convertDstVec convert_int4
+#else
+#define dstT4 float4
+#define convertDstVec convert_float4
#endif
__kernel void row_filter_C1_D0(__global const uchar * src, int src_step_in_pixel, int src_offset_x, int src_offset_y,
{
temp[0] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset - i);
temp[1] = vload4(0, (__local uchar*)&LDS_DAT[l_y][l_x] + RADIUSX + offset + i);
-#ifndef INTEGER_ARITHMETIC
- sum += mad(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]);
-#else
+#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
sum += mad24(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]);
+#else
+ sum += mad(convertDstVec(temp[0]), mat_kernel[RADIUSX-i], convertDstVec(temp[1]) * mat_kernel[RADIUSX + i]);
#endif
}
{
temp[0] = LDS_DAT[l_y][l_x + RADIUSX - i];
temp[1] = LDS_DAT[l_y][l_x + RADIUSX + i];
-#ifndef INTEGER_ARITHMETIC
- sum += mad(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]);
-#else
+#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
sum += mad24(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]);
+#else
+ sum += mad(convertToDstT(temp[0]), mat_kernel[RADIUSX - i], convertToDstT(temp[1]) * mat_kernel[RADIUSX + i]);
#endif
}
{
sum = (WT) 0;
for (i=0; i<=2*RADIUSY; i++)
-#ifndef INTEGER_ARITHMETIC
- sum = mad(lsmem[liy+i][clocX], mat_kernelY[i], sum);
-#else
+#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
sum = mad24(lsmem[liy+i][clocX], mat_kernelY[i], sum);
+#else
+ sum = mad(lsmem[liy+i][clocX], mat_kernelY[i], sum);
#endif
lsmemDy[liy][clocX] = sum;
clocX += BLK_X;
// and calculate final result
sum = 0.0f;
for (i=0; i<=2*RADIUSX; i++)
-#ifndef INTEGER_ARITHMETIC
- sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
-#else
+#if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
+#else
+ sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
+#endif
+#ifdef INTEGER_ARITHMETIC
+#ifdef INTEL_DEVICE
+ sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS);
+#else
sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;
+#endif
#endif
// store result into destination image