From: yao Date: Fri, 5 Apr 2013 13:29:29 +0000 (+0800) Subject: remove the C3 kernels in arithm, as the oclMat will never store 3 channels data X-Git-Tag: accepted/tizen/ivi/20140515.103456~1^2~824^2~1 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=bee970ab94384664c005142083e806bcf4a870f0;p=profile%2Fivi%2Fopencv.git remove the C3 kernels in arithm, as the oclMat will never store 3 channels data --- diff --git a/modules/ocl/src/opencl/arithm_add.cl b/modules/ocl/src/opencl/arithm_add.cl index 6471715..f8f32cd 100644 --- a/modules/ocl/src/opencl/arithm_add.cl +++ b/modules/ocl/src/opencl/arithm_add.cl @@ -675,322 +675,7 @@ __kernel void arithm_add_with_mask_C2_D6 (__global double *src1, int src1_step, } } #endif -__kernel void arithm_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = vload4(0, src2 + src2_index + 0); - uchar4 src2_data_1 = vload4(0, src2 + src2_index + 4); - uchar4 src2_data_2 = vload4(0, src2 + src2_index + 8); - - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = convert_uchar4_sat(convert_short4_sat(src1_data_0) + convert_short4_sat(src2_data_0)); - uchar4 tmp_data_1 = convert_uchar4_sat(convert_short4_sat(src1_data_1) + convert_short4_sat(src2_data_1)); - uchar4 tmp_data_2 = convert_uchar4_sat(convert_short4_sat(src1_data_2) + convert_short4_sat(src2_data_2)); - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} -__kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 0)); - ushort2 src2_data_1 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 4)); - ushort2 src2_data_2 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = convert_ushort2_sat(convert_int2_sat(src1_data_0) + convert_int2_sat(src2_data_0)); - ushort2 tmp_data_1 = convert_ushort2_sat(convert_int2_sat(src1_data_1) + convert_int2_sat(src2_data_1)); - ushort2 tmp_data_2 = convert_ushort2_sat(convert_int2_sat(src1_data_2) + convert_int2_sat(src2_data_2)); - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 0)); - short2 src2_data_1 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 4)); - short2 src2_data_2 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = convert_short2_sat(convert_int2_sat(src1_data_0) + convert_int2_sat(src2_data_0)); - short2 tmp_data_1 = convert_short2_sat(convert_int2_sat(src1_data_1) + convert_int2_sat(src2_data_1)); - short2 tmp_data_2 = convert_short2_sat(convert_int2_sat(src1_data_2) + convert_int2_sat(src2_data_2)); - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_add_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = *((__global int *)((__global char *)src2 + src2_index + 0)); - int src2_data_1 = *((__global int *)((__global char *)src2 + src2_index + 4)); - int src2_data_2 = *((__global int *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = convert_int_sat((long)src1_data_0 + (long)src2_data_0); - int tmp_data_1 = convert_int_sat((long)src1_data_1 + (long)src2_data_1); - int tmp_data_2 = convert_int_sat((long)src1_data_2 + (long)src2_data_2); - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0)); - float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4)); - float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8)); - - float src2_data_0 = *((__global float *)((__global char *)src2 + src2_index + 0)); - float src2_data_1 = *((__global float *)((__global char *)src2 + src2_index + 4)); - float src2_data_2 = *((__global float *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - float data_0 = *((__global float *)((__global char *)dst + dst_index + 0)); - float data_1 = *((__global float *)((__global char *)dst + dst_index + 4)); - float data_2 = *((__global float *)((__global char *)dst + dst_index + 8)); - - float tmp_data_0 = src1_data_0 + src2_data_0; - float tmp_data_1 = src1_data_1 + src2_data_1; - float tmp_data_2 = src1_data_2 + src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global float *)((__global char *)dst + dst_index + 0))= data_0; - *((__global float *)((__global char *)dst + dst_index + 4))= data_1; - *((__global float *)((__global char *)dst + dst_index + 8))= data_2; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_add_with_mask_C3_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 24) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 )); - double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 )); - double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16)); - - double src2_data_0 = *((__global double *)((__global char *)src2 + src2_index + 0 )); - double src2_data_1 = *((__global double *)((__global char *)src2 + src2_index + 8 )); - double src2_data_2 = *((__global double *)((__global char *)src2 + src2_index + 16)); - - uchar mask_data = * (mask + mask_index); - - double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 )); - double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 )); - double data_2 = *((__global double *)((__global char *)dst + dst_index + 16)); - - double tmp_data_0 = src1_data_0 + src2_data_0; - double tmp_data_1 = src1_data_1 + src2_data_1; - double tmp_data_2 = src1_data_2 + src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global double *)((__global char *)dst + dst_index + 16))= data_2; - } -} -#endif __kernel void arithm_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, __global uchar *src2, int src2_step, int src2_offset, __global uchar *mask, int mask_step, int mask_offset, diff --git a/modules/ocl/src/opencl/arithm_add_scalar.cl b/modules/ocl/src/opencl/arithm_add_scalar.cl index 15ae95d..152b5a1 100644 --- a/modules/ocl/src/opencl/arithm_add_scalar.cl +++ b/modules/ocl/src/opencl/arithm_add_scalar.cl @@ -382,274 +382,7 @@ __kernel void arithm_s_add_C2_D6 (__global double *src1, int src1_step, int sr } } #endif -__kernel void arithm_s_add_C3_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x); - int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y); - int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = convert_uchar4_sat(convert_int4_sat(src1_data_0) + src2_data_0); - uchar4 tmp_data_1 = convert_uchar4_sat(convert_int4_sat(src1_data_1) + src2_data_1); - uchar4 tmp_data_2 = convert_uchar4_sat(convert_int4_sat(src1_data_2) + src2_data_2); - - data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} -__kernel void arithm_s_add_C3_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - int2 src2_data_0 = (int2)(src2.x, src2.y); - int2 src2_data_1 = (int2)(src2.z, src2.x); - int2 src2_data_2 = (int2)(src2.y, src2.z); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = convert_ushort2_sat(convert_int2_sat(src1_data_0) + src2_data_0); - ushort2 tmp_data_1 = convert_ushort2_sat(convert_int2_sat(src1_data_1) + src2_data_1); - ushort2 tmp_data_2 = convert_ushort2_sat(convert_int2_sat(src1_data_2) + src2_data_2); - - data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - int2 src2_data_0 = (int2)(src2.x, src2.y); - int2 src2_data_1 = (int2)(src2.z, src2.x); - int2 src2_data_2 = (int2)(src2.y, src2.z); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = convert_short2_sat(convert_int2_sat(src1_data_0) + src2_data_0); - short2 tmp_data_1 = convert_short2_sat(convert_int2_sat(src1_data_1) + src2_data_1); - short2 tmp_data_2 = convert_short2_sat(convert_int2_sat(src1_data_2) + src2_data_2); - - data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_add_C3_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = src2.x; - int src2_data_1 = src2.y; - int src2_data_2 = src2.z; - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = convert_int_sat((long)src1_data_0 + (long)src2_data_0); - int tmp_data_1 = convert_int_sat((long)src1_data_1 + (long)src2_data_1); - int tmp_data_2 = convert_int_sat((long)src1_data_2 + (long)src2_data_2); - - *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2; - } -} -__kernel void arithm_s_add_C3_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - float4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0)); - float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4)); - float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8)); - - float src2_data_0 = src2.x; - float src2_data_1 = src2.y; - float src2_data_2 = src2.z; - - float data_0 = *((__global float *)((__global char *)dst + dst_index + 0)); - float data_1 = *((__global float *)((__global char *)dst + dst_index + 4)); - float data_2 = *((__global float *)((__global char *)dst + dst_index + 8)); - - float tmp_data_0 = src1_data_0 + src2_data_0; - float tmp_data_1 = src1_data_1 + src2_data_1; - float tmp_data_2 = src1_data_2 + src2_data_2; - - *((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_add_C3_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - double4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - double src1_data_0 = *((__global double *)((__global char 
*)src1 + src1_index + 0 )); - double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 )); - double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16)); - - double src2_data_0 = src2.x; - double src2_data_1 = src2.y; - double src2_data_2 = src2.z; - - double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 )); - double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 )); - double data_2 = *((__global double *)((__global char *)dst + dst_index + 16)); - - double tmp_data_0 = src1_data_0 + src2_data_0; - double tmp_data_1 = src1_data_1 + src2_data_1; - double tmp_data_2 = src1_data_2 + src2_data_2; - - *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; - *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; - *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2; - } -} -#endif __kernel void arithm_s_add_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, __global uchar *dst, int dst_step, int dst_offset, int4 src2, int rows, int cols, int dst_step1) diff --git a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl index 1e2ae71..673e323 100644 --- a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl +++ b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl @@ -446,311 +446,6 @@ __kernel void arithm_s_add_with_mask_C2_D6 (__global double *src1, int src1_st } #endif -__kernel void arithm_s_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x); - int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y); - int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = convert_uchar4_sat(convert_int4_sat(src1_data_0) + src2_data_0); - uchar4 tmp_data_1 = convert_uchar4_sat(convert_int4_sat(src1_data_1) + src2_data_1); - uchar4 tmp_data_2 = convert_uchar4_sat(convert_int4_sat(src1_data_2) + src2_data_2); - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? 
tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} -__kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - int2 src2_data_0 = (int2)(src2.x, src2.y); - int2 src2_data_1 = (int2)(src2.z, src2.x); - int2 src2_data_2 = (int2)(src2.y, src2.z); - - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = convert_ushort2_sat(convert_int2_sat(src1_data_0) + src2_data_0); - ushort2 tmp_data_1 = convert_ushort2_sat(convert_int2_sat(src1_data_1) + src2_data_1); - ushort2 tmp_data_2 = convert_ushort2_sat(convert_int2_sat(src1_data_2) + src2_data_2); - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - int2 src2_data_0 = (int2)(src2.x, src2.y); - int2 src2_data_1 = (int2)(src2.z, src2.x); - int2 src2_data_2 = (int2)(src2.y, src2.z); - - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = convert_short2_sat(convert_int2_sat(src1_data_0) + src2_data_0); - short2 tmp_data_1 = convert_short2_sat(convert_int2_sat(src1_data_1) + src2_data_1); - short2 tmp_data_2 = convert_short2_sat(convert_int2_sat(src1_data_2) + src2_data_2); - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_add_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = src2.x; - int src2_data_1 = src2.y; - int src2_data_2 = src2.z; - - uchar mask_data = * (mask + mask_index); - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = convert_int_sat((long)src1_data_0 + (long)src2_data_0); - int tmp_data_1 = convert_int_sat((long)src1_data_1 + (long)src2_data_1); - int tmp_data_2 = convert_int_sat((long)src1_data_2 + (long)src2_data_2); - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_add_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - float4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0)); - float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4)); - float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8)); - - float src2_data_0 = src2.x; - float src2_data_1 = src2.y; - float src2_data_2 = src2.z; - - uchar mask_data = * (mask + mask_index); - - float data_0 = *((__global float *)((__global char *)dst + dst_index + 0)); - float data_1 = *((__global float *)((__global char *)dst + dst_index + 4)); - float data_2 = *((__global float *)((__global char *)dst + dst_index + 8)); - - float tmp_data_0 = src1_data_0 + src2_data_0; - float tmp_data_1 = src1_data_1 + src2_data_1; - float tmp_data_2 = src1_data_2 + src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global float *)((__global char *)dst + dst_index + 0))= data_0; - *((__global float *)((__global char *)dst + dst_index + 4))= data_1; - *((__global float *)((__global char *)dst + dst_index + 8))= data_2; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_add_with_mask_C3_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - double4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 )); - double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 )); - double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16)); - - double src2_data_0 = src2.x; - double src2_data_1 = src2.y; - double src2_data_2 = src2.z; - - uchar mask_data = * (mask + mask_index); - - double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 )); - double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 )); - double data_2 = *((__global double *)((__global char *)dst + dst_index + 16)); - - double tmp_data_0 = src1_data_0 + src2_data_0; - double tmp_data_1 = src1_data_1 + src2_data_1; - double tmp_data_2 = src1_data_2 + src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global double *)((__global char *)dst + dst_index + 16))= data_2; - } -} -#endif - __kernel void arithm_s_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, __global uchar *dst, int dst_step, int dst_offset, __global uchar *mask, int mask_step, int mask_offset, diff --git a/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl index fbc4236..5e0428f 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl @@ -565,397 +565,6 @@ __kernel void arithm_bitwise_and_with_mask_C2_D6 ( } - -__kernel void arithm_bitwise_and_with_mask_C3_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = 
vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = vload4(0, src2 + src2_index + 0); - uchar4 src2_data_1 = vload4(0, src2 + src2_index + 4); - uchar4 src2_data_2 = vload4(0, src2 + src2_index + 8); - - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = src1_data_0 & src2_data_0; - uchar4 tmp_data_1 = src1_data_1 & src2_data_1; - uchar4 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} - - -__kernel void arithm_bitwise_and_with_mask_C3_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - char4 src1_data_0 = vload4(0, src1 + src1_index + 0); - char4 src1_data_1 = vload4(0, src1 + src1_index + 4); - char4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - char4 src2_data_0 = vload4(0, src2 + src2_index + 0); - char4 src2_data_1 = vload4(0, src2 + src2_index + 4); - char4 src2_data_2 = vload4(0, src2 + src2_index + 8); - - uchar4 mask_data = vload4(0, mask + mask_index); - - char4 data_0 = *((__global char4 *)(dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)(dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)(dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 & src2_data_0; - char4 tmp_data_1 = src1_data_1 & src2_data_1; - char4 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? 
tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? tmp_data_2.yzw : data_2.yzw; - - *((__global char4 *)(dst + dst_index + 0)) = data_0; - *((__global char4 *)(dst + dst_index + 4)) = data_1; - *((__global char4 *)(dst + dst_index + 8)) = data_2; - } -} - -__kernel void arithm_bitwise_and_with_mask_C3_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 0)); - ushort2 src2_data_1 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 4)); - ushort2 src2_data_2 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = src1_data_0 & src2_data_0 ; - ushort2 tmp_data_1 = src1_data_1 & src2_data_1 ; - ushort2 tmp_data_2 = src1_data_2 & src2_data_2 ; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_and_with_mask_C3_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 0)); - short2 src2_data_1 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 4)); - short2 src2_data_2 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = src1_data_0 & src2_data_0 ; - short2 tmp_data_1 = src1_data_1 & src2_data_1 ; - short2 tmp_data_2 = src1_data_2 & src2_data_2 ; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_and_with_mask_C3_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = *((__global int *)((__global char *)src2 + src2_index + 0)); - int src2_data_1 = *((__global int *)((__global char *)src2 + src2_index + 4)); - int src2_data_2 = *((__global int *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = src1_data_0 & src2_data_0 ; - int tmp_data_1 = src1_data_1 & src2_data_1 ; - int tmp_data_2 = src1_data_2 & src2_data_2 ; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_and_with_mask_C3_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0)); - char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4)); - char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8)); - - char4 src2_data_0 = *((__global char4 *)((__global char *)src2 + src2_index + 0)); - char4 src2_data_1 = *((__global char4 *)((__global char *)src2 + src2_index + 4)); - char4 src2_data_2 = *((__global char4 *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 & src2_data_0; - char4 tmp_data_1 = src1_data_1 & src2_data_1; - char4 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_and_with_mask_C3_D6 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 24) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - char8 src1_data_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0 )); - char8 src1_data_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8 )); - char8 src1_data_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16)); - - char8 src2_data_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0 )); - char8 src2_data_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8 )); - char8 src2_data_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16)); - - uchar mask_data = * (mask + mask_index); - - char8 data_0 = *((__global char8 *)((__global char *)dst + dst_index + 0 )); - char8 data_1 = *((__global char8 *)((__global char *)dst + dst_index + 8 )); - char8 data_2 = *((__global char8 *)((__global char *)dst + dst_index + 16)); - - char8 tmp_data_0 = src1_data_0 & src2_data_0; - char8 tmp_data_1 = src1_data_1 & src2_data_1; - char8 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2; - } -} -#endif - - __kernel void arithm_bitwise_and_with_mask_C4_D0 ( __global uchar *src1, int src1_step, int src1_offset, __global uchar *src2, int src2_step, int src2_offset, diff --git a/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl b/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl index 5058d31..9605476 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl @@ -461,340 +461,7 @@ __kernel void arithm_s_bitwise_and_C2_D6 ( } } #endif -__kernel void arithm_s_bitwise_and_C3_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x); - uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y); - uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = src1_data_0 & src2_data_0; - uchar4 tmp_data_1 = src1_data_1 & src2_data_1; - uchar4 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} - - -__kernel void arithm_s_bitwise_and_C3_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - char4 src1_data_0 = vload4(0, src1 + src1_index + 0); - char4 src1_data_1 = vload4(0, src1 + src1_index + 4); - char4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x); - char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y); - char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z); - - char4 data_0 = *((__global char4 *)(dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)(dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)(dst + dst_index + 8)); - - char4 tmp_data_0 = convert_char4_sat(convert_uchar4_sat(src1_data_0) & convert_uchar4_sat(src2_data_0)); - char4 tmp_data_1 = convert_char4_sat(convert_uchar4_sat(src1_data_1) & convert_uchar4_sat(src2_data_1)); - char4 tmp_data_2 = convert_char4_sat(convert_uchar4_sat(src1_data_2) & convert_uchar4_sat(src2_data_2)); - - data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global char4 *)(dst + dst_index + 0)) = data_0; - *((__global char4 *)(dst + dst_index + 4)) = data_1; - *((__global char4 *)(dst + dst_index + 8)) = data_2; - } -} - -__kernel void arithm_s_bitwise_and_C3_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = (ushort2)(src2.x, src2.y); - ushort2 src2_data_1 = (ushort2)(src2.z, src2.x); - ushort2 src2_data_2 = (ushort2)(src2.y, src2.z); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = src1_data_0 & src2_data_0; - ushort2 tmp_data_1 = src1_data_1 & src2_data_1; - ushort2 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_and_C3_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = (short2)(src2.x, src2.y); - short2 src2_data_1 = (short2)(src2.z, src2.x); - short2 src2_data_2 = (short2)(src2.y, src2.z); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = src1_data_0 & src2_data_0; - short2 tmp_data_1 = src1_data_1 & src2_data_1; - short2 tmp_data_2 = src1_data_2 & src2_data_2; - - data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_and_C3_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = src2.x; - int src2_data_1 = src2.y; - int src2_data_2 = src2.z; - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = src1_data_0 & src2_data_0; - int tmp_data_1 = src1_data_1 & src2_data_1; - int tmp_data_2 = src1_data_2 & src2_data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2; - } -} -__kernel void arithm_s_bitwise_and_C3_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0)); - char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4)); - char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8)); - - char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3); - char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7); - char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB); - - char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 & src2_data_0; - char4 tmp_data_1 = src1_data_1 & src2_data_1; - char4 tmp_data_2 = src1_data_2 & src2_data_2; - - *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_and_C3_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - short4
src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 )); - short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 )); - short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16)); - - short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7); - short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb); - - short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 )); - short4 data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8 )); - short4 data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16)); - - short4 tmp_data_0 = src1_data_0 & src2_data_0; - short4 tmp_data_1 = src1_data_1 & src2_data_1; - short4 tmp_data_2 = src1_data_2 & src2_data_2; - - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2; - } -} -#endif __kernel void arithm_s_bitwise_and_C4_D0 ( __global uchar *src1, int src1_step, int src1_offset, __global uchar *dst, int dst_step, int dst_offset, diff --git a/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl index 2523edd..f2cc36e 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl @@ -566,396 +566,6 @@ __kernel void arithm_bitwise_or_with_mask_C2_D6 ( #endif -__kernel void arithm_bitwise_or_with_mask_C3_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = vload4(0, src2 + src2_index + 0); - uchar4 src2_data_1 = vload4(0, src2 + src2_index + 4); - uchar4 src2_data_2 = vload4(0, src2 + src2_index + 8); - - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = src1_data_0 | src2_data_0; - uchar4 tmp_data_1 = src1_data_1 | src2_data_1; - uchar4 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? 
tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} - - -__kernel void arithm_bitwise_or_with_mask_C3_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - char4 src1_data_0 = vload4(0, src1 + src1_index + 0); - char4 src1_data_1 = vload4(0, src1 + src1_index + 4); - char4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - char4 src2_data_0 = vload4(0, src2 + src2_index + 0); - char4 src2_data_1 = vload4(0, src2 + src2_index + 4); - char4 src2_data_2 = vload4(0, src2 + src2_index + 8); - - uchar4 mask_data = vload4(0, mask + mask_index); - - char4 data_0 = *((__global char4 *)(dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)(dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)(dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 | src2_data_0; - char4 tmp_data_1 = src1_data_1 | src2_data_1; - char4 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global char4 *)(dst + dst_index + 0)) = data_0; - *((__global char4 *)(dst + dst_index + 4)) = data_1; - *((__global char4 *)(dst + dst_index + 8)) = data_2; - } -} - -__kernel void arithm_bitwise_or_with_mask_C3_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 0)); - ushort2 src2_data_1 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 4)); - ushort2 src2_data_2 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = src1_data_0 | src2_data_0 ; - ushort2 tmp_data_1 = src1_data_1 | src2_data_1 ; - ushort2 tmp_data_2 = src1_data_2 | src2_data_2 ; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_or_with_mask_C3_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 0)); - short2 src2_data_1 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 4)); - short2 src2_data_2 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = src1_data_0 | src2_data_0 ; - short2 tmp_data_1 = src1_data_1 | src2_data_1 ; - short2 tmp_data_2 = src1_data_2 | src2_data_2 ; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_or_with_mask_C3_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = *((__global int *)((__global char *)src2 + src2_index + 0)); - int src2_data_1 = *((__global int *)((__global char *)src2 + src2_index + 4)); - int src2_data_2 = *((__global int *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = src1_data_0 | src2_data_0 ; - int tmp_data_1 = src1_data_1 | src2_data_1 ; - int tmp_data_2 = src1_data_2 | src2_data_2 ; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_or_with_mask_C3_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0)); - char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4)); - char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8)); - - char4 src2_data_0 = *((__global char4 *)((__global char *)src2 + src2_index + 0)); - char4 src2_data_1 = *((__global char4 *)((__global char *)src2 + src2_index + 4)); - char4 src2_data_2 = *((__global char4 *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 | src2_data_0; - char4 tmp_data_1 = src1_data_1 | src2_data_1; - char4 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_or_with_mask_C3_D6 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 24) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - char8 src1_data_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0 )); - char8 src1_data_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8 )); - char8 src1_data_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16)); - - char8 src2_data_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0 )); - char8 src2_data_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8 )); - char8 src2_data_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16)); - - uchar mask_data = * (mask + mask_index); - - char8 data_0 = *((__global char8 *)((__global char *)dst + dst_index + 0 )); - char8 data_1 = *((__global char8 *)((__global char *)dst + dst_index + 8 )); - char8 data_2 = *((__global char8 *)((__global char *)dst + dst_index + 16)); - - char8 tmp_data_0 = src1_data_0 | src2_data_0; - char8 tmp_data_1 = src1_data_1 | src2_data_1; - char8 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2; - } -} -#endif - - __kernel void arithm_bitwise_or_with_mask_C4_D0 ( __global uchar *src1, int src1_step, int src1_offset, __global uchar *src2, int src2_step, int src2_offset, diff --git a/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl b/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl index fdcc00c..7ade345 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl @@ -457,344 +457,7 @@ __kernel void arithm_s_bitwise_or_C2_D6 ( } } #endif -__kernel void arithm_s_bitwise_or_C3_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x); - uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y); - uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = src1_data_0 | src2_data_0 ; - uchar4 tmp_data_1 = src1_data_1 | src2_data_1 ; - uchar4 tmp_data_2 = src1_data_2 | src2_data_2 ; - - data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} - - -__kernel void arithm_s_bitwise_or_C3_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - char4 src1_data_0 = vload4(0, src1 + src1_index + 0); - char4 src1_data_1 = vload4(0, src1 + src1_index + 4); - char4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x); - char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y); - char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z); - - char4 data_0 = *((__global char4 *)(dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)(dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)(dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 | src2_data_0; - char4 tmp_data_1 = src1_data_1 | src2_data_1; - char4 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global char4 *)(dst + dst_index + 0)) = data_0; - *((__global char4 *)(dst + dst_index + 4)) = data_1; - *((__global char4 *)(dst + dst_index + 8)) = data_2; - } -} - -__kernel void arithm_s_bitwise_or_C3_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = (ushort2)(src2.x, src2.y); - ushort2 src2_data_1 = (ushort2)(src2.z, src2.x); - ushort2 src2_data_2 = (ushort2)(src2.y, src2.z); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = src1_data_0 | src2_data_0 ; - ushort2 tmp_data_1 = src1_data_1 | src2_data_1 ; - ushort2 tmp_data_2 = src1_data_2 | src2_data_2 ; - - data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_or_C3_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = (short2)(src2.x, src2.y); - short2 src2_data_1 = (short2)(src2.z, src2.x); - short2 src2_data_2 = (short2)(src2.y, src2.z); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = src1_data_0 | src2_data_0 ; - short2 tmp_data_1 = src1_data_1 | src2_data_1 ; - short2 tmp_data_2 = src1_data_2 | src2_data_2 ; - - data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_or_C3_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = src2.x; - int src2_data_1 = src2.y; - int src2_data_2 = src2.z; - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = src1_data_0 | src2_data_0; - int tmp_data_1 = src1_data_1 | src2_data_1; - int tmp_data_2 = src1_data_2 | src2_data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2; - } -} -__kernel void arithm_s_bitwise_or_C3_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0)); - char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4)); - char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8)); - - char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3); - char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7); - char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB); - - char4 tmp_data_0 = src1_data_0 | src2_data_0; - char4 tmp_data_1 = src1_data_1 | src2_data_1; - char4 tmp_data_2 = src1_data_2 | src2_data_2; - - *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_or_C3_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) - -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 )); - short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 )); - short4 src1_data_2 = *((__global short4 *)((__global char *)src1
+ src1_index + 16)); - - short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7); - short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb); - - short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 )); - short4 data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8 )); - short4 data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16)); - - short4 tmp_data_0 = src1_data_0 | src2_data_0; - short4 tmp_data_1 = src1_data_1 | src2_data_1; - short4 tmp_data_2 = src1_data_2 | src2_data_2; - - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2; - } -} -#endif __kernel void arithm_s_bitwise_or_C4_D0 ( __global uchar *src1, int src1_step, int src1_offset, __global uchar *dst, int dst_step, int dst_offset, diff --git a/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl index 8baa9a2..b8f07a8 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl @@ -533,387 +533,7 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D6 ( } } #endif -__kernel void arithm_s_bitwise_or_with_mask_C3_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x); - uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y); - uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = src1_data_0 | src2_data_0; - uchar4 tmp_data_1 = src1_data_1 | src2_data_1; - uchar4 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} - - -__kernel void arithm_s_bitwise_or_with_mask_C3_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - char4 src1_data_0 = vload4(0, src1 + src1_index + 0); - char4 src1_data_1 = vload4(0, src1 + src1_index + 4); - char4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x); - char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y); - char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 mask_data = vload4(0, mask + mask_index); - - char4 data_0 = *((__global char4 *)(dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)(dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)(dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 | src2_data_0; - char4 tmp_data_1 = src1_data_1 | src2_data_1; - char4 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global char4 *)(dst + dst_index + 0)) = data_0; - *((__global char4 *)(dst + dst_index + 4)) = data_1; - *((__global char4 *)(dst + dst_index + 8)) = data_2; - } -} - -__kernel void arithm_s_bitwise_or_with_mask_C3_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = (ushort2)(src2.x, src2.y); - ushort2 src2_data_1 = (ushort2)(src2.z, src2.x); - ushort2 src2_data_2 = (ushort2)(src2.y, src2.z); - - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = src1_data_0 | src2_data_0; - ushort2 tmp_data_1 = src1_data_1 | src2_data_1; - ushort2 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_or_with_mask_C3_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = (short2)(src2.x, src2.y); - short2 src2_data_1 = (short2)(src2.z, src2.x); - short2 src2_data_2 = (short2)(src2.y, src2.z); - - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = src1_data_0 | src2_data_0; - short2 tmp_data_1 = src1_data_1 | src2_data_1; - short2 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_or_with_mask_C3_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = src2.x; - int src2_data_1 = src2.y; - int src2_data_2 = src2.z; - - uchar mask_data = * (mask + mask_index); - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = src1_data_0 | src2_data_0; - int tmp_data_1 = src1_data_1 | src2_data_1; - int tmp_data_2 = src1_data_2 | src2_data_2; - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_or_with_mask_C3_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) - -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0)); - char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4)); - char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8)); - - char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3); - char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7); - char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB); - - uchar mask_data = * (mask + mask_index); - - char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 | src2_data_0; - char4 tmp_data_1 = src1_data_1 | src2_data_1; - char4 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; - - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_or_with_mask_C3_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 )); - short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 )); - short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16)); - - short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7); - short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb); - - uchar mask_data = * (mask + mask_index); - - short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 )); - short4 data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8 )); - short4 data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16)); - - short4 tmp_data_0 = src1_data_0 | src2_data_0; - short4 tmp_data_1 = src1_data_1 | src2_data_1; - short4 tmp_data_2 = src1_data_2 | src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= data_2; - } -} -#endif __kernel void arithm_s_bitwise_or_with_mask_C4_D0 ( __global uchar *src1, int src1_step, int src1_offset, __global uchar *dst, int dst_step, int dst_offset, diff --git a/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl index 48bd3e4..7655be3 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl @@ -565,397 +565,6 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D6 ( } #endif - -__kernel void arithm_bitwise_xor_with_mask_C3_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = vload4(0, src2 + src2_index + 0); - uchar4 src2_data_1 = vload4(0, src2 + src2_index + 4); - uchar4 src2_data_2 = vload4(0, src2 + src2_index + 8); - - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = src1_data_0 ^ src2_data_0; - uchar4 tmp_data_1 = src1_data_1 ^ src2_data_1; - uchar4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} - - -__kernel void arithm_bitwise_xor_with_mask_C3_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - char4 src1_data_0 = vload4(0, src1 + src1_index + 0); - char4 src1_data_1 = vload4(0, src1 + src1_index + 4); - char4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - char4 src2_data_0 = vload4(0, src2 + src2_index + 0); - char4 src2_data_1 = vload4(0, src2 + src2_index + 4); - char4 src2_data_2 = vload4(0, src2 + src2_index + 8); - - uchar4 mask_data = vload4(0, mask + mask_index); - - char4 data_0 = *((__global char4 *)(dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)(dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)(dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 ^ src2_data_0; - char4 tmp_data_1 = src1_data_1 ^ src2_data_1; - char4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global char4 *)(dst + dst_index + 0)) = data_0; - *((__global char4 *)(dst + dst_index + 4)) = data_1; - *((__global char4 *)(dst + dst_index + 8)) = data_2; - } -} - -__kernel void arithm_bitwise_xor_with_mask_C3_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 0)); - ushort2 src2_data_1 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 4)); - ushort2 src2_data_2 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = src1_data_0 ^ src2_data_0 ; - ushort2 tmp_data_1 = src1_data_1 ^ src2_data_1 ; - ushort2 tmp_data_2 = src1_data_2 ^ src2_data_2 ; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_xor_with_mask_C3_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 0)); - short2 src2_data_1 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 4)); - short2 src2_data_2 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = src1_data_0 ^ src2_data_0 ; - short2 tmp_data_1 = src1_data_1 ^ src2_data_1 ; - short2 tmp_data_2 = src1_data_2 ^ src2_data_2 ; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_xor_with_mask_C3_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = *((__global int *)((__global char *)src2 + src2_index + 0)); - int src2_data_1 = *((__global int *)((__global char *)src2 + src2_index + 4)); - int src2_data_2 = *((__global int *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = src1_data_0 ^ src2_data_0 ; - int tmp_data_1 = src1_data_1 ^ src2_data_1 ; - int tmp_data_2 = src1_data_2 ^ src2_data_2 ; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_bitwise_xor_with_mask_C3_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0)); - char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4)); - char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8)); - - char4 src2_data_0 = *((__global char4 *)((__global char *)src2 + src2_index + 0)); - char4 src2_data_1 = *((__global char4 *)((__global char *)src2 + src2_index + 4)); - char4 src2_data_2 = *((__global char4 *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 ^ src2_data_0; - char4 tmp_data_1 = src1_data_1 ^ src2_data_1; - char4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_xor_with_mask_C3_D6 ( - __global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 24) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - char8 src1_data_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0 )); - char8 src1_data_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8 )); - char8 src1_data_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16)); - - char8 src2_data_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0 )); - char8 src2_data_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8 )); - char8 src2_data_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16)); - - uchar mask_data = * (mask + mask_index); - - char8 data_0 = *((__global char8 *)((__global char *)dst + dst_index + 0 )); - char8 data_1 = *((__global char8 *)((__global char *)dst + dst_index + 8 )); - char8 data_2 = *((__global char8 *)((__global char *)dst + dst_index + 16)); - - char8 tmp_data_0 = src1_data_0 ^ src2_data_0; - char8 tmp_data_1 = src1_data_1 ^ src2_data_1; - char8 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2; - } -} -#endif - - __kernel void arithm_bitwise_xor_with_mask_C4_D0 ( __global uchar *src1, int src1_step, int src1_offset, __global uchar *src2, int src2_step, int src2_offset, diff --git a/modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl index 2c6dd50..73b5687 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl @@ -461,340 +461,7 @@ __kernel void arithm_s_bitwise_xor_C2_D6 ( } } #endif -__kernel void arithm_s_bitwise_xor_C3_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x); - uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y); - uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = src1_data_0 ^ src2_data_0; - uchar4 tmp_data_1 = src1_data_1 ^ src2_data_1; - uchar4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} - - -__kernel void arithm_s_bitwise_xor_C3_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - char4 src1_data_0 = vload4(0, src1 + src1_index + 0); - char4 src1_data_1 = vload4(0, src1 + src1_index + 4); - char4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x); - char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y); - char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z); - - char4 data_0 = *((__global char4 *)(dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)(dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)(dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 ^ src2_data_0; - char4 tmp_data_1 = src1_data_1 ^ src2_data_1; - char4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global char4 *)(dst + dst_index + 0)) = data_0; - *((__global char4 *)(dst + dst_index + 4)) = data_1; - *((__global char4 *)(dst + dst_index + 8)) = data_2; - } -} - -__kernel void arithm_s_bitwise_xor_C3_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = (ushort2)(src2.x, src2.y); - ushort2 src2_data_1 = (ushort2)(src2.z, src2.x); - ushort2 src2_data_2 = (ushort2)(src2.y, src2.z); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = src1_data_0 ^ src2_data_0; - ushort2 tmp_data_1 = src1_data_1 ^ src2_data_1; - ushort2 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_xor_C3_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = (short2)(src2.x, src2.y); - short2 src2_data_1 = (short2)(src2.z, src2.x); - short2 src2_data_2 = (short2)(src2.y, src2.z); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = src1_data_0 ^ src2_data_0; - short2 tmp_data_1 = src1_data_1 ^ src2_data_1; - short2 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_xor_C3_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = src2.x; - int src2_data_1 = src2.y; - int src2_data_2 = src2.z; - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = src1_data_0 ^ src2_data_0; - int tmp_data_1 = src1_data_1 ^ src2_data_1; - int tmp_data_2 = src1_data_2 ^ src2_data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2; - } -} -__kernel void arithm_s_bitwise_xor_C3_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0)); - char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4)); - char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8)); - - char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3); - char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7); - char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB); - - char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 ^ src2_data_0; - char4 tmp_data_1 = src1_data_1 ^ src2_data_1; - char4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_xor_C3_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - short4 
src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 )); - short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 )); - short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16)); - - short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7); - short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb); - - short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 )); - short4 data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8 )); - short4 data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16)); - - short4 tmp_data_0 = src1_data_0 ^ src2_data_0; - short4 tmp_data_1 = src1_data_1 ^ src2_data_1; - short4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2; - } -} -#endif __kernel void arithm_s_bitwise_xor_C4_D0 ( __global uchar *src1, int src1_step, int src1_offset, __global uchar *dst, int dst_step, int dst_offset, diff --git a/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl index 26ca59c..ad481aa 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl @@ -523,380 +523,7 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D6 ( } } #endif -__kernel void arithm_s_bitwise_xor_with_mask_C3_D0 ( - __global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x); - uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y); - uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = src1_data_0 ^ src2_data_0; - uchar4 tmp_data_1 = src1_data_1 ^ src2_data_1; - uchar4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? 
tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} - - -__kernel void arithm_s_bitwise_xor_with_mask_C3_D1 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - char4 src1_data_0 = vload4(0, src1 + src1_index + 0); - char4 src1_data_1 = vload4(0, src1 + src1_index + 4); - char4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x); - char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y); - char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z); - - uchar4 mask_data = vload4(0, mask + mask_index); - - char4 data_0 = *((__global char4 *)(dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)(dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)(dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 ^ src2_data_0; - char4 tmp_data_1 = src1_data_1 ^ src2_data_1; - char4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global char4 *)(dst + dst_index + 0)) = data_0; - *((__global char4 *)(dst + dst_index + 4)) = data_1; - *((__global char4 *)(dst + dst_index + 8)) = data_2; - } -} - -__kernel void arithm_s_bitwise_xor_with_mask_C3_D2 ( - __global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = (ushort2)(src2.x, src2.y); - ushort2 src2_data_1 = (ushort2)(src2.z, src2.x); - ushort2 src2_data_2 = (ushort2)(src2.y, src2.z); - - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = src1_data_0 ^ src2_data_0; - ushort2 tmp_data_1 = src1_data_1 ^ src2_data_1; - ushort2 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_xor_with_mask_C3_D3 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - -#ifdef dst_align -#undef dst_align -#endif -#define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = (short2)(src2.x, src2.y); - short2 src2_data_1 = (short2)(src2.z, src2.x); - short2 src2_data_2 = (short2)(src2.y, src2.z); - - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = src1_data_0 ^ src2_data_0; - short2 tmp_data_1 = src1_data_1 ^ src2_data_1; - short2 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_xor_with_mask_C3_D4 ( - __global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = src2.x; - int src2_data_1 = src2.y; - int src2_data_2 = src2.z; - - uchar mask_data = * (mask + mask_index); - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = src1_data_0 ^ src2_data_0; - int tmp_data_1 = src1_data_1 ^ src2_data_1; - int tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_s_bitwise_xor_with_mask_C3_D5 ( - __global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0)); - char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4)); - char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8)); - - char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3); - char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7); - char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB); - - uchar mask_data = * (mask + mask_index); - - char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0)); - char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4)); - char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8)); - - char4 tmp_data_0 = src1_data_0 ^ src2_data_0; - char4 tmp_data_1 = src1_data_1 ^ src2_data_1; - char4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_xor_with_mask_C3_D6 ( - __global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 )); - short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 )); - short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16)); - - short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); - short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7); - short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb); - - uchar mask_data = * (mask + mask_index); - - short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 )); - short4 data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8 )); - short4 data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16)); - - short4 tmp_data_0 = src1_data_0 ^ src2_data_0; - short4 tmp_data_1 = src1_data_1 ^ src2_data_1; - short4 tmp_data_2 = src1_data_2 ^ src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= data_2; - } -} -#endif __kernel void arithm_s_bitwise_xor_with_mask_C4_D0 ( __global uchar *src1, int src1_step, int src1_offset, __global uchar *dst, int dst_step, int dst_offset, diff --git a/modules/ocl/src/opencl/arithm_sub.cl b/modules/ocl/src/opencl/arithm_sub.cl index 9cf3797..d461d3a 100644 --- a/modules/ocl/src/opencl/arithm_sub.cl +++ b/modules/ocl/src/opencl/arithm_sub.cl @@ -44,7 +44,11 @@ //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif ////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -618,313 +622,7 @@ __kernel void arithm_sub_with_mask_C2_D6 (__global double *src1, int src1_step, } } #endif -__kernel void arithm_sub_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) - int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); - int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); - - uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); - uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); - uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); - - uchar4 src2_data_0 = vload4(0, src2 + src2_index + 0); - uchar4 src2_data_1 = vload4(0, src2 + src2_index + 4); - uchar4 src2_data_2 = vload4(0, src2 + src2_index + 8); - - uchar4 mask_data = vload4(0, mask + mask_index); - - uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); - uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); - uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); - - uchar4 tmp_data_0 = convert_uchar4_sat(convert_short4_sat(src1_data_0) - convert_short4_sat(src2_data_0)); - uchar4 tmp_data_1 = convert_uchar4_sat(convert_short4_sat(src1_data_1) - convert_short4_sat(src2_data_1)); - uchar4 tmp_data_2 = convert_uchar4_sat(convert_short4_sat(src1_data_2) - convert_short4_sat(src2_data_2)); - - data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; - data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_0.w : data_0.w; - - data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) - ? tmp_data_1.xy : data_1.xy; - data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.zw : data_1.zw; - - data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.x : data_2.x; - data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) - ? 
tmp_data_2.yzw : data_2.yzw; - - *((__global uchar4 *)(dst + dst_index + 0)) = data_0; - *((__global uchar4 *)(dst + dst_index + 4)) = data_1; - *((__global uchar4 *)(dst + dst_index + 8)) = data_2; - } -} -__kernel void arithm_sub_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); - ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); - ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); - - ushort2 src2_data_0 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 0)); - ushort2 src2_data_1 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 4)); - ushort2 src2_data_2 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); - ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); - ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); - - ushort2 tmp_data_0 = convert_ushort2_sat(convert_int2_sat(src1_data_0) - convert_int2_sat(src2_data_0)); - ushort2 tmp_data_1 = convert_ushort2_sat(convert_int2_sat(src1_data_1) - convert_int2_sat(src2_data_1)); - ushort2 tmp_data_2 = convert_ushort2_sat(convert_int2_sat(src1_data_2) - convert_int2_sat(src2_data_2)); - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_sub_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) - int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); - int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); - int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); - - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); - - short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); - short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); - short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); - - short2 src2_data_0 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 0)); - short2 src2_data_1 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 4)); - short2 src2_data_2 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 8)); - - uchar2 mask_data = vload2(0, mask + mask_index); - - short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); - short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); - short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); - - short2 tmp_data_0 = convert_short2_sat(convert_int2_sat(src1_data_0) - convert_int2_sat(src2_data_0)); - short2 tmp_data_1 = convert_short2_sat(convert_int2_sat(src1_data_1) - convert_int2_sat(src2_data_1)); - short2 tmp_data_2 = convert_short2_sat(convert_int2_sat(src1_data_2) - convert_int2_sat(src2_data_2)); - - data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; - - data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; - data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; - - data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? 
tmp_data_2.xy : data_2.xy; - - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_sub_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); - int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); - int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); - - int src2_data_0 = *((__global int *)((__global char *)src2 + src2_index + 0)); - int src2_data_1 = *((__global int *)((__global char *)src2 + src2_index + 4)); - int src2_data_2 = *((__global int *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); - int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); - int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); - - int tmp_data_0 = convert_int_sat((long)src1_data_0 - (long)src2_data_0); - int tmp_data_1 = convert_int_sat((long)src1_data_1 - (long)src2_data_1); - int tmp_data_2 = convert_int_sat((long)src1_data_2 - (long)src2_data_2); - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; - } -} -__kernel void arithm_sub_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 12) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); - - float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0)); - float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4)); - float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8)); - - float src2_data_0 = *((__global float *)((__global char *)src2 + src2_index + 0)); - float src2_data_1 = *((__global float *)((__global char *)src2 + src2_index + 4)); - float src2_data_2 = *((__global float *)((__global char *)src2 + src2_index + 8)); - - uchar mask_data = * (mask + mask_index); - - float data_0 = *((__global float *)((__global char *)dst + dst_index + 0)); - float data_1 = *((__global float *)((__global char *)dst + dst_index + 4)); - float data_2 = *((__global float *)((__global char *)dst + dst_index + 8)); - - float tmp_data_0 = src1_data_0 - src2_data_0; - float tmp_data_1 = src1_data_1 - src2_data_1; - float tmp_data_2 = src1_data_2 - src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? 
tmp_data_2 : data_2; - - *((__global float *)((__global char *)dst + dst_index + 0))= data_0; - *((__global float *)((__global char *)dst + dst_index + 4))= data_1; - *((__global float *)((__global char *)dst + dst_index + 8))= data_2; - } -} - -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_sub_with_mask_C3_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global double *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) -{ - - int x = get_global_id(0); - int y = get_global_id(1); - - if (x < cols && y < rows) - { - int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); - int src2_index = mad24(y, src2_step, (x * 24) + src2_offset); - int mask_index = mad24(y, mask_step, x + mask_offset); - int dst_index = mad24(y, dst_step, dst_offset + (x * 24)); - - double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 )); - double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 )); - double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16)); - - double src2_data_0 = *((__global double *)((__global char *)src2 + src2_index + 0 )); - double src2_data_1 = *((__global double *)((__global char *)src2 + src2_index + 8 )); - double src2_data_2 = *((__global double *)((__global char *)src2 + src2_index + 16)); - - uchar mask_data = * (mask + mask_index); - - double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 )); - double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 )); - double data_2 = *((__global double *)((__global char *)dst + dst_index + 16)); - - double tmp_data_0 = src1_data_0 - src2_data_0; - double tmp_data_1 = src1_data_1 - src2_data_1; - double tmp_data_2 = src1_data_2 - src2_data_2; - - data_0 = mask_data ? tmp_data_0 : data_0; - data_1 = mask_data ? tmp_data_1 : data_1; - data_2 = mask_data ? tmp_data_2 : data_2; - - *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global double *)((__global char *)dst + dst_index + 16))= data_2; - } -} -#endif __kernel void arithm_sub_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, __global uchar *src2, int src2_step, int src2_offset, __global uchar *mask, int mask_step, int mask_offset, diff --git a/modules/ocl/src/opencl/arithm_sub_scalar.cl b/modules/ocl/src/opencl/arithm_sub_scalar.cl index 782bcd0..76bb294 100644 --- a/modules/ocl/src/opencl/arithm_sub_scalar.cl +++ b/modules/ocl/src/opencl/arithm_sub_scalar.cl @@ -42,9 +42,12 @@ // the use of this software, even if advised of the possibility of such damage. 
 //
 //M*/
-
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 /**************************************sub with scalar without mask**************************************/
 __kernel void arithm_s_sub_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
@@ -372,305 +375,7 @@ __kernel void arithm_s_sub_C2_D6 (__global double *src1, int src1_step, int sr
     }
 }
 #endif
-__kernel void arithm_s_sub_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                  __global uchar *dst, int dst_step, int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
-        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
-
-        int dst_start = mad24(y, dst_step, dst_offset);
-        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
-
-        uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
-        uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
-        uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
-
-        int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
-        int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
-        int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
-
-        uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
-        uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
-        uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
-
-        int4 tmp_0 = convert_int4_sat(src1_data_0) - src2_data_0;
-        int4 tmp_1 = convert_int4_sat(src1_data_1) - src2_data_1;
-        int4 tmp_2 = convert_int4_sat(src1_data_2) - src2_data_2;
-
-        tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
-        tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
-        tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
-
-        uchar4 tmp_data_0 = convert_uchar4_sat(tmp_0);
-        uchar4 tmp_data_1 = convert_uchar4_sat(tmp_1);
-        uchar4 tmp_data_2 = convert_uchar4_sat(tmp_2);
-
-        data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
-        data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
-                    ? tmp_data_0.w : data_0.w;
-
-        data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
-                    ? tmp_data_1.xy : data_1.xy;
-        data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                    ? tmp_data_1.zw : data_1.zw;
-
-        data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                    ? tmp_data_2.x : data_2.x;
-        data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
-                    ? tmp_data_2.yzw : data_2.yzw;
-
-        *((__global uchar4 *)(dst + dst_index + 0)) = data_0;
-        *((__global uchar4 *)(dst + dst_index + 4)) = data_1;
-        *((__global uchar4 *)(dst + dst_index + 8)) = data_2;
-    }
-}
-__kernel void arithm_s_sub_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                  __global ushort *dst, int dst_step, int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
-        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
-
-        int dst_start = mad24(y, dst_step, dst_offset);
-        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
-        ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
-        ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
-        ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
-
-        int2 src2_data_0 = (int2)(src2.x, src2.y);
-        int2 src2_data_1 = (int2)(src2.z, src2.x);
-        int2 src2_data_2 = (int2)(src2.y, src2.z);
-
-        ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
-        ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
-        ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
-
-        int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0;
-        int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1;
-        int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2;
-
-        tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
-        tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
-        tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
-
-        ushort2 tmp_data_0 = convert_ushort2_sat(tmp_0);
-        ushort2 tmp_data_1 = convert_ushort2_sat(tmp_1);
-        ushort2 tmp_data_2 = convert_ushort2_sat(tmp_2);
-
-        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
-        data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                    ? tmp_data_1.x : data_1.x;
-        data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                    ? tmp_data_1.y : data_1.y;
-
-        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                    ? tmp_data_2.xy : data_2.xy;
-
-        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
-    }
-}
-__kernel void arithm_s_sub_C3_D3 (__global short *src1, int src1_step, int src1_offset,
-                                  __global short *dst, int dst_step, int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
-        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
-
-        int dst_start = mad24(y, dst_step, dst_offset);
-        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
-        short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
-        short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
-        short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
-
-        int2 src2_data_0 = (int2)(src2.x, src2.y);
-        int2 src2_data_1 = (int2)(src2.z, src2.x);
-        int2 src2_data_2 = (int2)(src2.y, src2.z);
-
-        short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
-        short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
-        short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
-
-        int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0;
-        int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1;
-        int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2;
-
-        tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
-        tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
-        tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
-
-        short2 tmp_data_0 = convert_short2_sat(tmp_0);
-        short2 tmp_data_1 = convert_short2_sat(tmp_1);
-        short2 tmp_data_2 = convert_short2_sat(tmp_2);
-
-        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
-        data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                    ? tmp_data_1.x : data_1.x;
-        data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                    ? tmp_data_1.y : data_1.y;
-
-        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                    ? tmp_data_2.xy : data_2.xy;
-
-        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
-    }
-}
-__kernel void arithm_s_sub_C3_D4 (__global int *src1, int src1_step, int src1_offset,
-                                  __global int *dst, int dst_step, int dst_offset,
-                                  int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
-        int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
-        int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
-        int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
-        int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
-
-        int src2_data_0 = src2.x;
-        int src2_data_1 = src2.y;
-        int src2_data_2 = src2.z;
-
-        int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
-        int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
-        int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
-
-        long tmp_0 = (long)src1_data_0 - (long)src2_data_0;
-        long tmp_1 = (long)src1_data_1 - (long)src2_data_1;
-        long tmp_2 = (long)src1_data_2 - (long)src2_data_2;
-
-        tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
-        tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
-        tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
-
-        int tmp_data_0 = convert_int_sat(tmp_0);
-        int tmp_data_1 = convert_int_sat(tmp_1);
-        int tmp_data_2 = convert_int_sat(tmp_2);
-
-        *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-        *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-        *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
-    }
-}
-__kernel void arithm_s_sub_C3_D5 (__global float *src1, int src1_step, int src1_offset,
-                                  __global float *dst, int dst_step, int dst_offset,
-                                  float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
-        int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
-        float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
-        float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
-        float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
-
-        float src2_data_0 = src2.x;
-        float src2_data_1 = src2.y;
-        float src2_data_2 = src2.z;
-
-        float data_0 = *((__global float *)((__global char *)dst + dst_index + 0));
-        float data_1 = *((__global float *)((__global char *)dst + dst_index + 4));
-        float data_2 = *((__global float *)((__global char *)dst + dst_index + 8));
-
-        float tmp_0 = src1_data_0 - src2_data_0;
-        float tmp_1 = src1_data_1 - src2_data_1;
-        float tmp_2 = src1_data_2 - src2_data_2;
-
-        tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
-        tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
-        tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
-
-        *((__global float *)((__global char *)dst + dst_index + 0))= tmp_0;
-        *((__global float *)((__global char *)dst + dst_index + 4))= tmp_1;
-        *((__global float *)((__global char *)dst + dst_index + 8))= tmp_2;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_sub_C3_D6 (__global double *src1, int src1_step, int src1_offset,
-                                  __global double *dst, int dst_step, int dst_offset,
-                                  double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
-        int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
-
-        double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
-        double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
-        double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
-
-        double src2_data_0 = src2.x;
-        double src2_data_1 = src2.y;
-        double src2_data_2 = src2.z;
-
-        double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 ));
-        double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 ));
-        double data_2 = *((__global double *)((__global char *)dst + dst_index + 16));
-
-        double tmp_data_0 = src1_data_0 - src2_data_0;
-        double tmp_data_1 = src1_data_1 - src2_data_1;
-        double tmp_data_2 = src1_data_2 - src2_data_2;
-
-        tmp_data_0 = isMatSubScalar ? tmp_data_0 : -tmp_data_0;
-        tmp_data_1 = isMatSubScalar ? tmp_data_1 : -tmp_data_1;
-        tmp_data_2 = isMatSubScalar ? tmp_data_2 : -tmp_data_2;
-
-        *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
-        *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
-        *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
-    }
-}
-#endif
 __kernel void arithm_s_sub_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                   __global uchar *dst, int dst_step, int dst_offset,
                                   int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
diff --git a/modules/ocl/src/opencl/arithm_sub_scalar_mask.cl b/modules/ocl/src/opencl/arithm_sub_scalar_mask.cl
index 1353549..9b758cf 100644
--- a/modules/ocl/src/opencl/arithm_sub_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_sub_scalar_mask.cl
@@ -44,7 +44,11 @@
 //M*/
 
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 
 /**************************************sub with scalar with mask**************************************/
@@ -430,341 +434,7 @@ __kernel void arithm_s_sub_with_mask_C2_D6 (__global double *src1, int src1_st
     }
 }
 #endif
-__kernel void arithm_s_sub_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                                            __global uchar *dst, int dst_step, int dst_offset,
-                                            __global uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 2;
-
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
-        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start = mad24(y, dst_step, dst_offset);
-        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
-
-        uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
-        uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
-        uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
-
-        int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
-        int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
-        int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
-
-        uchar4 mask_data = vload4(0, mask + mask_index);
-
-        uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
-        uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
-        uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
-
-        int4 tmp_0 = convert_int4_sat(src1_data_0) - src2_data_0;
-        int4 tmp_1 = convert_int4_sat(src1_data_1) - src2_data_1;
-        int4 tmp_2 = convert_int4_sat(src1_data_2) - src2_data_2;
-
-        tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
-        tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
-        tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
-
-        uchar4 tmp_data_0 = convert_uchar4_sat(tmp_0);
-        uchar4 tmp_data_1 = convert_uchar4_sat(tmp_1);
-        uchar4 tmp_data_2 = convert_uchar4_sat(tmp_2);
-
-        data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
-        data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
-                    ? tmp_data_0.w : data_0.w;
-
-        data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
-                    ? tmp_data_1.xy : data_1.xy;
-        data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                    ? tmp_data_1.zw : data_1.zw;
-
-        data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                    ? tmp_data_2.x : data_2.x;
-        data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
-                    ? tmp_data_2.yzw : data_2.yzw;
-
-        *((__global uchar4 *)(dst + dst_index + 0)) = data_0;
-        *((__global uchar4 *)(dst + dst_index + 4)) = data_1;
-        *((__global uchar4 *)(dst + dst_index + 8)) = data_2;
-    }
-}
-__kernel void arithm_s_sub_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                                            __global ushort *dst, int dst_step, int dst_offset,
-                                            __global uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
-        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start = mad24(y, dst_step, dst_offset);
-        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
-        ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
-        ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
-        ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
-
-        int2 src2_data_0 = (int2)(src2.x, src2.y);
-        int2 src2_data_1 = (int2)(src2.z, src2.x);
-        int2 src2_data_2 = (int2)(src2.y, src2.z);
-
-        uchar2 mask_data = vload2(0, mask + mask_index);
-
-        ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
-        ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
-        ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
-
-        int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0;
-        int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1;
-        int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2;
-
-        tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
-        tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
-        tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
-
-        ushort2 tmp_data_0 = convert_ushort2_sat(tmp_0);
-        ushort2 tmp_data_1 = convert_ushort2_sat(tmp_1);
-        ushort2 tmp_data_2 = convert_ushort2_sat(tmp_2);
-
-        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
-        data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                    ? tmp_data_1.x : data_1.x;
-        data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                    ? tmp_data_1.y : data_1.y;
-
-        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                    ? tmp_data_2.xy : data_2.xy;
-
-        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
-    }
-}
-__kernel void arithm_s_sub_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
-                                            __global short *dst, int dst_step, int dst_offset,
-                                            __global uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        x = x << 1;
-
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
-        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
-        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
-        int dst_start = mad24(y, dst_step, dst_offset);
-        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
-        int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-        short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
-        short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
-        short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
-
-        int2 src2_data_0 = (int2)(src2.x, src2.y);
-        int2 src2_data_1 = (int2)(src2.z, src2.x);
-        int2 src2_data_2 = (int2)(src2.y, src2.z);
-
-        uchar2 mask_data = vload2(0, mask + mask_index);
-
-        short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
-        short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
-        short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
-
-        int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0;
-        int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1;
-        int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2;
-
-        tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
-        tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
-        tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
-
-        short2 tmp_data_0 = convert_short2_sat(tmp_0);
-        short2 tmp_data_1 = convert_short2_sat(tmp_1);
-        short2 tmp_data_2 = convert_short2_sat(tmp_2);
-
-        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
-        data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                    ? tmp_data_1.x : data_1.x;
-        data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                    ? tmp_data_1.y : data_1.y;
-
-        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                    ? tmp_data_2.xy : data_2.xy;
-
-        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
-    }
-}
-__kernel void arithm_s_sub_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
-                                            __global int *dst, int dst_step, int dst_offset,
-                                            __global uchar *mask, int mask_step, int mask_offset,
-                                            int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
-        int mask_index = mad24(y, mask_step, x + mask_offset);
-        int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
-        int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
-        int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
-        int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
-
-        int src2_data_0 = src2.x;
-        int src2_data_1 = src2.y;
-        int src2_data_2 = src2.z;
-
-        uchar mask_data = * (mask + mask_index);
-
-        int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
-        int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
-        int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
-
-        long tmp_0 = (long)src1_data_0 - (long)src2_data_0;
-        long tmp_1 = (long)src1_data_1 - (long)src2_data_1;
-        long tmp_2 = (long)src1_data_2 - (long)src2_data_2;
-
-        tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
-        tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
-        tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
-
-        int tmp_data_0 = convert_int_sat(tmp_0);
-        int tmp_data_1 = convert_int_sat(tmp_1);
-        int tmp_data_2 = convert_int_sat(tmp_2);
-
-        data_0 = mask_data ? tmp_data_0 : data_0;
-        data_1 = mask_data ? tmp_data_1 : data_1;
-        data_2 = mask_data ? tmp_data_2 : data_2;
-
-        *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
-        *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
-        *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
-    }
-}
-__kernel void arithm_s_sub_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset,
-                                            __global float *dst, int dst_step, int dst_offset,
-                                            __global uchar *mask, int mask_step, int mask_offset,
-                                            float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
-        int mask_index = mad24(y, mask_step, x + mask_offset);
-        int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
-        float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
-        float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
-        float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
-
-        float src2_data_0 = src2.x;
-        float src2_data_1 = src2.y;
-        float src2_data_2 = src2.z;
-
-        uchar mask_data = * (mask + mask_index);
-
-        float data_0 = *((__global float *)((__global char *)dst + dst_index + 0));
-        float data_1 = *((__global float *)((__global char *)dst + dst_index + 4));
-        float data_2 = *((__global float *)((__global char *)dst + dst_index + 8));
-
-        float tmp_data_0 = src1_data_0 - src2_data_0;
-        float tmp_data_1 = src1_data_1 - src2_data_1;
-        float tmp_data_2 = src1_data_2 - src2_data_2;
-
-        tmp_data_0 = isMatSubScalar ? tmp_data_0 : -tmp_data_0;
-        tmp_data_1 = isMatSubScalar ? tmp_data_1 : -tmp_data_1;
-        tmp_data_2 = isMatSubScalar ? tmp_data_2 : -tmp_data_2;
-
-        data_0 = mask_data ? tmp_data_0 : data_0;
-        data_1 = mask_data ? tmp_data_1 : data_1;
-        data_2 = mask_data ? tmp_data_2 : data_2;
-
-        *((__global float *)((__global char *)dst + dst_index + 0))= data_0;
-        *((__global float *)((__global char *)dst + dst_index + 4))= data_1;
-        *((__global float *)((__global char *)dst + dst_index + 8))= data_2;
-    }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_sub_with_mask_C3_D6 (__global double *src1, int src1_step, int src1_offset,
-                                            __global double *dst, int dst_step, int dst_offset,
-                                            __global uchar *mask, int mask_step, int mask_offset,
-                                            double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
-{
-
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-
-    if (x < cols && y < rows)
-    {
-        int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
-        int mask_index = mad24(y, mask_step, x + mask_offset);
-        int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
-
-        double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
-        double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
-        double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
-
-        double src2_data_0 = src2.x;
-        double src2_data_1 = src2.y;
-        double src2_data_2 = src2.z;
-
-        uchar mask_data = * (mask + mask_index);
-
-        double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 ));
-        double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 ));
-        double data_2 = *((__global double *)((__global char *)dst + dst_index + 16));
-
-        double tmp_data_0 = src1_data_0 - src2_data_0;
-        double tmp_data_1 = src1_data_1 - src2_data_1;
-        double tmp_data_2 = src1_data_2 - src2_data_2;
-
-        tmp_data_0 = isMatSubScalar ? tmp_data_0 : -tmp_data_0;
-        tmp_data_1 = isMatSubScalar ? tmp_data_1 : -tmp_data_1;
-        tmp_data_2 = isMatSubScalar ? tmp_data_2 : -tmp_data_2;
-
-        data_0 = mask_data ? tmp_data_0 : data_0;
-        data_1 = mask_data ? tmp_data_1 : data_1;
-        data_2 = mask_data ? tmp_data_2 : data_2;
-
-        *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
-        *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
-        *((__global double *)((__global char *)dst + dst_index + 16))= data_2;
-    }
-}
-#endif
 __kernel void arithm_s_sub_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                             __global uchar *dst, int dst_step, int dst_offset,
                                             __global uchar *mask, int mask_step, int mask_offset,
diff --git a/modules/ocl/test/test_arithm.cpp b/modules/ocl/test/test_arithm.cpp
index f643864..e46fdbd 100644
--- a/modules/ocl/test/test_arithm.cpp
+++ b/modules/ocl/test/test_arithm.cpp
@@ -1531,6 +1531,10 @@ INSTANTIATE_TEST_CASE_P(Arithm, Add, Combine(
                             Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4),
                             Values(false)));
 
+INSTANTIATE_TEST_CASE_P(Arithm, Sub, Combine(
+                            Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4),
+                            Values(false)));
+
 INSTANTIATE_TEST_CASE_P(Arithm, Mul, Combine(
                             Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4),
                             Values(false))); // Values(false) is the reserved parameter
@@ -1586,19 +1590,19 @@ INSTANTIATE_TEST_CASE_P(Arithm, Phase, Combine(Values(CV_32FC1, CV_32FC3, CV_32F
 
 INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_and, Combine(
-                        Values(CV_8UC1, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
+                        Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
 //Values(false) is the reserved parameter
 
 INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_or, Combine(
-                        Values(CV_8UC1, CV_8UC3, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
+                        Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
 //Values(false) is the reserved parameter
 
 INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_xor, Combine(
-                        Values(CV_8UC1, CV_8UC3, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
+                        Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
 //Values(false) is the reserved parameter
 
 INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_not, Combine(
-                        Values(CV_8UC1, CV_8UC3, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
+                        Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
 //Values(false) is the reserved parameter
 
 INSTANTIATE_TEST_CASE_P(Arithm, Compare, Combine(Values(CV_8UC1, CV_32SC1, CV_32FC1), Values(false)));