}
}
#endif
-__kernel void arithm_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
- __global uchar *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global uchar *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{ /* Masked saturating add for 3-channel 8-bit unsigned data:
-   * dst = saturate(src1 + src2) for pixels whose mask byte is non-zero; other pixels keep
-   * the value already stored in dst.
-   *
-   * src1, src2, dst : pixel buffers; the step and offset arguments are added directly to the
-   *                   uchar pointers, i.e. they are byte quantities here.
-   * mask            : one byte per pixel.
-   * rows, cols      : upper bounds for get_global_id(1) and get_global_id(0). Each work-item
-   *                   inside the bounds handles 4 consecutive pixels (12 bytes).
-   * dst_step1       : added to the row start to form dst_end, the exclusive upper byte bound
-   *                   for modified pixels in the row (presumably the valid row width in
-   *                   bytes; confirm against the host code).
-   */
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 2; /* index of the first of the 4 pixels handled by this work-item */
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
- int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); /* dst_align = pixel column of the dst row start modulo 4; every index steps back by that many pixels */
- int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); /* mask advances one byte per pixel */
-
- int dst_start = mad24(y, dst_step, dst_offset); /* first byte of this row's destination region */
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1); /* dst_start + dst_step1 */
- int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
- uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); /* 12 bytes = 4 pixels x 3 channels, fetched as three uchar4 */
- uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
- uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
-
- uchar4 src2_data_0 = vload4(0, src2 + src2_index + 0);
- uchar4 src2_data_1 = vload4(0, src2 + src2_index + 4);
- uchar4 src2_data_2 = vload4(0, src2 + src2_index + 8);
-
- uchar4 mask_data = vload4(0, mask + mask_index); /* .x/.y/.z/.w = mask of pixel 0/1/2/3 */
-
- uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); /* current dst bytes, kept wherever a pixel is not selected */
- uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
- uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
-
- uchar4 tmp_data_0 = convert_uchar4_sat(convert_short4_sat(src1_data_0) + convert_short4_sat(src2_data_0)); /* widen to short, add, saturate back to uchar */
- uchar4 tmp_data_1 = convert_uchar4_sat(convert_short4_sat(src1_data_1) + convert_short4_sat(src2_data_1));
- uchar4 tmp_data_2 = convert_uchar4_sat(convert_short4_sat(src1_data_2) + convert_short4_sat(src2_data_2));
-
- data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; /* bytes 0-2: pixel 0 (only the lower bound is tested) */
- data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_0.w : data_0.w; /* byte 3: pixel 1, channel 0 */
-
- data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_1.xy : data_1.xy; /* bytes 4-5: pixel 1, channels 1-2 */
- data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.zw : data_1.zw; /* bytes 6-7: pixel 2, channels 0-1 */
-
- data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.x : data_2.x; /* byte 8: pixel 2, channel 2 */
- data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
- ? tmp_data_2.yzw : data_2.yzw; /* bytes 9-11: pixel 3 */
-
- *((__global uchar4 *)(dst + dst_index + 0)) = data_0; /* all 12 bytes are written back; unselected bytes carry their previous value */
- *((__global uchar4 *)(dst + dst_index + 4)) = data_1;
- *((__global uchar4 *)(dst + dst_index + 8)) = data_2;
- }
-}
-__kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
- __global ushort *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global ushort *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{ /* Masked saturating add for 3-channel 16-bit unsigned data:
-   * dst = saturate(src1 + src2) for pixels whose mask byte is non-zero; other pixels keep
-   * the value already stored in dst.
-   *
-   * src1, src2, dst : pixel buffers; indices are applied after a cast to (__global char *),
-   *                   so the step and offset arguments are byte quantities.
-   * mask            : one byte per pixel.
-   * rows, cols      : upper bounds for get_global_id(1) and get_global_id(0). Each work-item
-   *                   inside the bounds handles 2 consecutive pixels (12 bytes).
-   * dst_step1       : added to the row start to form dst_end, the exclusive upper byte bound
-   *                   for modified pixels in the row (presumably the valid row width in
-   *                   bytes; confirm against the host code).
-   */
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1; /* index of the first of the 2 pixels handled by this work-item */
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); /* dst_align = pixel column of the dst row start modulo 2; every index steps back by that many pixels (6 bytes each) */
- int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); /* mask advances one byte per pixel */
-
- int dst_start = mad24(y, dst_step, dst_offset); /* first byte of this row's destination region */
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1); /* dst_start + dst_step1 */
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); /* 12 bytes = 2 pixels x 3 channels, fetched as three ushort2 */
- ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
- ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
-
- ushort2 src2_data_0 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 0));
- ushort2 src2_data_1 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 4));
- ushort2 src2_data_2 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 8));
-
- uchar2 mask_data = vload2(0, mask + mask_index); /* .x/.y = mask of pixel 0/1 */
-
- ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); /* current dst values, kept wherever a pixel is not selected */
- ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
- ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
-
- ushort2 tmp_data_0 = convert_ushort2_sat(convert_int2_sat(src1_data_0) + convert_int2_sat(src2_data_0)); /* widen to int, add, saturate back to ushort */
- ushort2 tmp_data_1 = convert_ushort2_sat(convert_int2_sat(src1_data_1) + convert_int2_sat(src2_data_1));
- ushort2 tmp_data_2 = convert_ushort2_sat(convert_int2_sat(src1_data_2) + convert_int2_sat(src2_data_2));
-
- data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; /* pixel 0, channels 0-1 (only the lower bound is tested) */
-
- data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x; /* pixel 0, channel 2 */
- data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y; /* pixel 1, channel 0 */
-
- data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy; /* pixel 1, channels 1-2 */
-
- *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; /* all 12 bytes are written back; unselected values are unchanged */
- *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
- __global short *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global short *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{ /* Masked saturating add for 3-channel 16-bit signed data:
-   * dst = saturate(src1 + src2) for pixels whose mask byte is non-zero; other pixels keep
-   * the value already stored in dst.
-   *
-   * src1, src2, dst : pixel buffers; indices are applied after a cast to (__global char *),
-   *                   so the step and offset arguments are byte quantities.
-   * mask            : one byte per pixel.
-   * rows, cols      : upper bounds for get_global_id(1) and get_global_id(0). Each work-item
-   *                   inside the bounds handles 2 consecutive pixels (12 bytes).
-   * dst_step1       : added to the row start to form dst_end, the exclusive upper byte bound
-   *                   for modified pixels in the row (presumably the valid row width in
-   *                   bytes; confirm against the host code).
-   */
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1; /* index of the first of the 2 pixels handled by this work-item */
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); /* dst_align = pixel column of the dst row start modulo 2; every index steps back by that many pixels (6 bytes each) */
- int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); /* mask advances one byte per pixel */
-
- int dst_start = mad24(y, dst_step, dst_offset); /* first byte of this row's destination region */
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1); /* dst_start + dst_step1 */
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); /* 12 bytes = 2 pixels x 3 channels, fetched as three short2 */
- short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
- short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
-
- short2 src2_data_0 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 0));
- short2 src2_data_1 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 4));
- short2 src2_data_2 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 8));
-
- uchar2 mask_data = vload2(0, mask + mask_index); /* .x/.y = mask of pixel 0/1 */
-
- short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); /* current dst values, kept wherever a pixel is not selected */
- short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
- short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
-
- short2 tmp_data_0 = convert_short2_sat(convert_int2_sat(src1_data_0) + convert_int2_sat(src2_data_0)); /* widen to int, add, saturate back to short */
- short2 tmp_data_1 = convert_short2_sat(convert_int2_sat(src1_data_1) + convert_int2_sat(src2_data_1));
- short2 tmp_data_2 = convert_short2_sat(convert_int2_sat(src1_data_2) + convert_int2_sat(src2_data_2));
-
- data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; /* pixel 0, channels 0-1 (only the lower bound is tested) */
-
- data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x; /* pixel 0, channel 2 */
- data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y; /* pixel 1, channel 0 */
-
- data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy; /* pixel 1, channels 1-2 */
-
- *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; /* all 12 bytes are written back; unselected values are unchanged */
- *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_add_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
- __global int *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global int *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{ /* Masked saturating add for 3-channel 32-bit signed data, one pixel per work-item:
-   * dst = saturate(src1 + src2) when the pixel's mask byte is non-zero, otherwise dst is
-   * rewritten with the value it already held.
-   *
-   * src1, src2, dst : pixel buffers; indices are applied after a cast to (__global char *),
-   *                   so the step and offset arguments are byte quantities (12 bytes/pixel).
-   * mask            : one byte per pixel.
-   * rows, cols      : upper bounds for get_global_id(1) and get_global_id(0).
-   * dst_step1       : not referenced by this kernel.
-   */
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); /* y * step + x * 12 + offset, in bytes */
- int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); /* the three channels sit at byte offsets 0, 4 and 8 */
- int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
- int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
-
- int src2_data_0 = *((__global int *)((__global char *)src2 + src2_index + 0));
- int src2_data_1 = *((__global int *)((__global char *)src2 + src2_index + 4));
- int src2_data_2 = *((__global int *)((__global char *)src2 + src2_index + 8));
-
- uchar mask_data = * (mask + mask_index);
-
- int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); /* current dst values, used when the mask is zero */
- int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
- int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
-
- int tmp_data_0 = convert_int_sat((long)src1_data_0 + (long)src2_data_0); /* add in 64-bit, then saturate to the int range */
- int tmp_data_1 = convert_int_sat((long)src1_data_1 + (long)src2_data_1);
- int tmp_data_2 = convert_int_sat((long)src1_data_2 + (long)src2_data_2);
-
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global int *)((__global char *)dst + dst_index + 0))= data_0; /* stored unconditionally, whatever the mask value */
- *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset,
- __global float *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global float *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{ /* Masked add for 3-channel float data, one pixel per work-item:
-   * dst = src1 + src2 when the pixel's mask byte is non-zero, otherwise dst is rewritten
-   * with the value it already held. No saturation is applied.
-   *
-   * src1, src2, dst : pixel buffers; indices are applied after a cast to (__global char *),
-   *                   so the step and offset arguments are byte quantities (12 bytes/pixel).
-   * mask            : one byte per pixel.
-   * rows, cols      : upper bounds for get_global_id(1) and get_global_id(0).
-   * dst_step1       : not referenced by this kernel.
-   */
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); /* y * step + x * 12 + offset, in bytes */
- int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0)); /* the three channels sit at byte offsets 0, 4 and 8 */
- float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
- float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
-
- float src2_data_0 = *((__global float *)((__global char *)src2 + src2_index + 0));
- float src2_data_1 = *((__global float *)((__global char *)src2 + src2_index + 4));
- float src2_data_2 = *((__global float *)((__global char *)src2 + src2_index + 8));
-
- uchar mask_data = * (mask + mask_index);
-
- float data_0 = *((__global float *)((__global char *)dst + dst_index + 0)); /* current dst values, used when the mask is zero */
- float data_1 = *((__global float *)((__global char *)dst + dst_index + 4));
- float data_2 = *((__global float *)((__global char *)dst + dst_index + 8));
-
- float tmp_data_0 = src1_data_0 + src2_data_0;
- float tmp_data_1 = src1_data_1 + src2_data_1;
- float tmp_data_2 = src1_data_2 + src2_data_2;
-
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global float *)((__global char *)dst + dst_index + 0))= data_0; /* stored unconditionally, whatever the mask value */
- *((__global float *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global float *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_add_with_mask_C3_D6 (__global double *src1, int src1_step, int src1_offset,
- __global double *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global double *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{ /* Masked add for 3-channel double data, one pixel per work-item:
-   * dst = src1 + src2 when the pixel's mask byte is non-zero, otherwise dst is rewritten
-   * with the value it already held. No saturation is applied.
-   *
-   * src1, src2, dst : pixel buffers; indices are applied after a cast to (__global char *),
-   *                   so the step and offset arguments are byte quantities (24 bytes/pixel).
-   * mask            : one byte per pixel.
-   * rows, cols      : upper bounds for get_global_id(1) and get_global_id(0).
-   * dst_step1       : not referenced by this kernel.
-   */
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); /* y * step + x * 24 + offset, in bytes */
- int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
-
- double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 )); /* the three channels sit at byte offsets 0, 8 and 16 */
- double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
- double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
-
- double src2_data_0 = *((__global double *)((__global char *)src2 + src2_index + 0 ));
- double src2_data_1 = *((__global double *)((__global char *)src2 + src2_index + 8 ));
- double src2_data_2 = *((__global double *)((__global char *)src2 + src2_index + 16));
-
- uchar mask_data = * (mask + mask_index);
-
- double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 )); /* current dst values, used when the mask is zero */
- double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 ));
- double data_2 = *((__global double *)((__global char *)dst + dst_index + 16));
-
- double tmp_data_0 = src1_data_0 + src2_data_0;
- double tmp_data_1 = src1_data_1 + src2_data_1;
- double tmp_data_2 = src1_data_2 + src2_data_2;
-
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0; /* stored unconditionally, whatever the mask value */
- *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
- *((__global double *)((__global char *)dst + dst_index + 16))= data_2;
- }
-}
-#endif
__kernel void arithm_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
}
}
#endif
-__kernel void arithm_s_add_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
- __global uchar *dst, int dst_step, int dst_offset,
- int4 src2, int rows, int cols, int dst_step1)
-{ /* Saturating add of a per-channel scalar to 3-channel 8-bit unsigned data:
-   * dst = saturate(src1 + src2), where src2.x, src2.y and src2.z are the addends for
-   * channels 0, 1 and 2 (src2.w is not read).
-   *
-   * src1, dst   : pixel buffers; the step and offset arguments are added directly to the
-   *               uchar pointers, i.e. they are byte quantities here.
-   * rows, cols  : upper bounds for get_global_id(1) and get_global_id(0). Each work-item
-   *               inside the bounds handles 4 consecutive pixels (12 bytes).
-   * dst_step1   : added to the row start to form dst_end, the exclusive upper byte bound for
-   *               modified pixels in the row (presumably the valid row width in bytes;
-   *               confirm against the host code).
-   */
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 2; /* index of the first of the 4 pixels handled by this work-item */
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
- int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); /* dst_align = pixel column of the dst row start modulo 4; indices step back by that many pixels */
-
- int dst_start = mad24(y, dst_step, dst_offset); /* first byte of this row's destination region */
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1); /* dst_start + dst_step1 */
- int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
-
- uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); /* 12 bytes = 4 pixels x 3 channels, fetched as three uchar4 */
- uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
- uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
-
- int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x); /* scalar channels repeated to match the interleaved layout of bytes 0-3 */
- int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y); /* bytes 4-7 */
- int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z); /* bytes 8-11 */
-
- uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); /* current dst bytes, kept for pixels outside the row bounds */
- uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
- uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
-
- uchar4 tmp_data_0 = convert_uchar4_sat(convert_int4_sat(src1_data_0) + src2_data_0); /* widen to int, add, saturate back to uchar */
- uchar4 tmp_data_1 = convert_uchar4_sat(convert_int4_sat(src1_data_1) + src2_data_1);
- uchar4 tmp_data_2 = convert_uchar4_sat(convert_int4_sat(src1_data_2) + src2_data_2);
-
- data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; /* bytes 0-2: pixel 0 (only the lower bound is tested) */
- data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_0.w : data_0.w; /* byte 3: pixel 1, channel 0 */
-
- data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_1.xy : data_1.xy; /* bytes 4-5: pixel 1, channels 1-2 */
- data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.zw : data_1.zw; /* bytes 6-7: pixel 2, channels 0-1 */
-
- data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.x : data_2.x; /* byte 8: pixel 2, channel 2 */
- data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
- ? tmp_data_2.yzw : data_2.yzw; /* bytes 9-11: pixel 3 */
-
- *((__global uchar4 *)(dst + dst_index + 0)) = data_0; /* all 12 bytes are written back; out-of-range bytes carry their previous value */
- *((__global uchar4 *)(dst + dst_index + 4)) = data_1;
- *((__global uchar4 *)(dst + dst_index + 8)) = data_2;
- }
-}
-__kernel void arithm_s_add_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
- __global ushort *dst, int dst_step, int dst_offset,
- int4 src2, int rows, int cols, int dst_step1)
-{ /* Saturating add of a per-channel scalar to 3-channel 16-bit unsigned data:
-   * dst = saturate(src1 + src2), where src2.x, src2.y and src2.z are the addends for
-   * channels 0, 1 and 2 (src2.w is not read).
-   *
-   * src1, dst   : pixel buffers; indices are applied after a cast to (__global char *), so
-   *               the step and offset arguments are byte quantities.
-   * rows, cols  : upper bounds for get_global_id(1) and get_global_id(0). Each work-item
-   *               inside the bounds handles 2 consecutive pixels (12 bytes).
-   * dst_step1   : added to the row start to form dst_end, the exclusive upper byte bound for
-   *               modified pixels in the row (presumably the valid row width in bytes;
-   *               confirm against the host code).
-   */
-
- int x = get_global_id(0);
- int y = get_global_id(1);
- if (x < cols && y < rows)
- {
- x = x << 1; /* index of the first of the 2 pixels handled by this work-item */
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); /* dst_align = pixel column of the dst row start modulo 2; indices step back by that many pixels (6 bytes each) */
-
- int dst_start = mad24(y, dst_step, dst_offset); /* first byte of this row's destination region */
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1); /* dst_start + dst_step1 */
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); /* 12 bytes = 2 pixels x 3 channels, fetched as three ushort2 */
- ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
- ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
-
- int2 src2_data_0 = (int2)(src2.x, src2.y); /* pixel 0, channels 0-1 */
- int2 src2_data_1 = (int2)(src2.z, src2.x); /* pixel 0 channel 2, pixel 1 channel 0 */
- int2 src2_data_2 = (int2)(src2.y, src2.z); /* pixel 1, channels 1-2 */
-
- ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); /* current dst values, kept for pixels outside the row bounds */
- ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
- ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
-
- ushort2 tmp_data_0 = convert_ushort2_sat(convert_int2_sat(src1_data_0) + src2_data_0); /* widen to int, add, saturate back to ushort */
- ushort2 tmp_data_1 = convert_ushort2_sat(convert_int2_sat(src1_data_1) + src2_data_1);
- ushort2 tmp_data_2 = convert_ushort2_sat(convert_int2_sat(src1_data_2) + src2_data_2);
-
- data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; /* pixel 0, channels 0-1 (only the lower bound is tested) */
-
- data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x; /* pixel 0, channel 2 */
- data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y; /* pixel 1, channel 0 */
-
- data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy; /* pixel 1, channels 1-2 */
-
- *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; /* all 12 bytes are written back; out-of-range values are unchanged */
- *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src1_offset,
- __global short *dst, int dst_step, int dst_offset,
- int4 src2, int rows, int cols, int dst_step1)
-{ /* Saturating add of a per-channel scalar to 3-channel 16-bit signed data:
-   * dst = saturate(src1 + src2), where src2.x, src2.y and src2.z are the addends for
-   * channels 0, 1 and 2 (src2.w is not read).
-   *
-   * src1, dst   : pixel buffers; indices are applied after a cast to (__global char *), so
-   *               the step and offset arguments are byte quantities.
-   * rows, cols  : upper bounds for get_global_id(1) and get_global_id(0). Each work-item
-   *               inside the bounds handles 2 consecutive pixels (12 bytes).
-   * dst_step1   : added to the row start to form dst_end, the exclusive upper byte bound for
-   *               modified pixels in the row (presumably the valid row width in bytes;
-   *               confirm against the host code).
-   */
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1; /* index of the first of the 2 pixels handled by this work-item */
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); /* dst_align = pixel column of the dst row start modulo 2; indices step back by that many pixels (6 bytes each) */
-
- int dst_start = mad24(y, dst_step, dst_offset); /* first byte of this row's destination region */
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1); /* dst_start + dst_step1 */
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); /* 12 bytes = 2 pixels x 3 channels, fetched as three short2 */
- short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
- short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
-
- int2 src2_data_0 = (int2)(src2.x, src2.y); /* pixel 0, channels 0-1 */
- int2 src2_data_1 = (int2)(src2.z, src2.x); /* pixel 0 channel 2, pixel 1 channel 0 */
- int2 src2_data_2 = (int2)(src2.y, src2.z); /* pixel 1, channels 1-2 */
-
- short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); /* current dst values, kept for pixels outside the row bounds */
- short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
- short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
-
- short2 tmp_data_0 = convert_short2_sat(convert_int2_sat(src1_data_0) + src2_data_0); /* widen to int, add, saturate back to short */
- short2 tmp_data_1 = convert_short2_sat(convert_int2_sat(src1_data_1) + src2_data_1);
- short2 tmp_data_2 = convert_short2_sat(convert_int2_sat(src1_data_2) + src2_data_2);
-
- data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; /* pixel 0, channels 0-1 (only the lower bound is tested) */
-
- data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x; /* pixel 0, channel 2 */
- data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y; /* pixel 1, channel 0 */
-
- data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy; /* pixel 1, channels 1-2 */
-
- *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; /* all 12 bytes are written back; out-of-range values are unchanged */
- *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_s_add_C3_D4 (__global int *src1, int src1_step, int src1_offset,
- __global int *dst, int dst_step, int dst_offset,
- int4 src2, int rows, int cols, int dst_step1)
-{ /* Saturating add of a per-channel scalar to 3-channel 32-bit signed data, one pixel per
-   * work-item: dst = saturate(src1 + src2), where src2.x, src2.y and src2.z are the addends
-   * for channels 0, 1 and 2 (src2.w is not read).
-   *
-   * src1, dst   : pixel buffers; indices are applied after a cast to (__global char *), so
-   *               the step and offset arguments are byte quantities (12 bytes/pixel).
-   * rows, cols  : upper bounds for get_global_id(1) and get_global_id(0).
-   * dst_step1   : not referenced by this kernel.
-   */
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); /* y * step + x * 12 + offset, in bytes */
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); /* the three channels sit at byte offsets 0, 4 and 8 */
- int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
- int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
-
- int src2_data_0 = src2.x;
- int src2_data_1 = src2.y;
- int src2_data_2 = src2.z;
-
- int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); /* NOTE(review): data_0..data_2 are read from dst but never used below */
- int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
- int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
-
- int tmp_data_0 = convert_int_sat((long)src1_data_0 + (long)src2_data_0); /* add in 64-bit, then saturate to the int range */
- int tmp_data_1 = convert_int_sat((long)src1_data_1 + (long)src2_data_1);
- int tmp_data_2 = convert_int_sat((long)src1_data_2 + (long)src2_data_2);
-
- *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
- *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
- *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
- }
-}
-__kernel void arithm_s_add_C3_D5 (__global float *src1, int src1_step, int src1_offset,
- __global float *dst, int dst_step, int dst_offset,
- float4 src2, int rows, int cols, int dst_step1)
-{ /* Adds a per-channel scalar to 3-channel float data, one pixel per work-item:
-   * dst = src1 + src2, where src2.x, src2.y and src2.z are the addends for channels 0, 1
-   * and 2 (src2.w is not read). No saturation is applied.
-   *
-   * src1, dst   : pixel buffers; indices are applied after a cast to (__global char *), so
-   *               the step and offset arguments are byte quantities (12 bytes/pixel).
-   * rows, cols  : upper bounds for get_global_id(1) and get_global_id(0).
-   * dst_step1   : not referenced by this kernel.
-   */
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); /* y * step + x * 12 + offset, in bytes */
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0)); /* the three channels sit at byte offsets 0, 4 and 8 */
- float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
- float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
-
- float src2_data_0 = src2.x;
- float src2_data_1 = src2.y;
- float src2_data_2 = src2.z;
-
- float data_0 = *((__global float *)((__global char *)dst + dst_index + 0)); /* NOTE(review): data_0..data_2 are read from dst but never used below */
- float data_1 = *((__global float *)((__global char *)dst + dst_index + 4));
- float data_2 = *((__global float *)((__global char *)dst + dst_index + 8));
-
- float tmp_data_0 = src1_data_0 + src2_data_0;
- float tmp_data_1 = src1_data_1 + src2_data_1;
- float tmp_data_2 = src1_data_2 + src2_data_2;
-
- *((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
- *((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
- *((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
- }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_add_C3_D6 (__global double *src1, int src1_step, int src1_offset,
- __global double *dst, int dst_step, int dst_offset,
- double4 src2, int rows, int cols, int dst_step1)
-{ /* Adds a per-channel scalar to 3-channel double data, one pixel per work-item:
-   * dst = src1 + src2, where src2.x, src2.y and src2.z are the addends for channels 0, 1
-   * and 2 (src2.w is not read). No saturation is applied.
-   *
-   * src1, dst   : pixel buffers; indices are applied after a cast to (__global char *), so
-   *               the step and offset arguments are byte quantities (24 bytes/pixel).
-   * rows, cols  : upper bounds for get_global_id(1) and get_global_id(0).
-   * dst_step1   : not referenced by this kernel.
-   */
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 24) + src1_offset); /* y * step + x * 24 + offset, in bytes */
- int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
-
- double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 )); /* the three channels sit at byte offsets 0, 8 and 16 */
- double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
- double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
-
- double src2_data_0 = src2.x;
- double src2_data_1 = src2.y;
- double src2_data_2 = src2.z;
-
- double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 )); /* NOTE(review): data_0..data_2 are read from dst but never used below */
- double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 ));
- double data_2 = *((__global double *)((__global char *)dst + dst_index + 16));
-
- double tmp_data_0 = src1_data_0 + src2_data_0;
- double tmp_data_1 = src1_data_1 + src2_data_1;
- double tmp_data_2 = src1_data_2 + src2_data_2;
-
- *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
- *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
- *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
- }
-}
-#endif
__kernel void arithm_s_add_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
}
#endif
-__kernel void arithm_s_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
- __global uchar *dst, int dst_step, int dst_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- int4 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
- int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
-
- uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
- uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
- uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
-
- int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
- int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
- int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
-
- uchar4 mask_data = vload4(0, mask + mask_index);
-
- uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
- uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
- uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
-
- uchar4 tmp_data_0 = convert_uchar4_sat(convert_int4_sat(src1_data_0) + src2_data_0);
- uchar4 tmp_data_1 = convert_uchar4_sat(convert_int4_sat(src1_data_1) + src2_data_1);
- uchar4 tmp_data_2 = convert_uchar4_sat(convert_int4_sat(src1_data_2) + src2_data_2);
-
- data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
- data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_0.w : data_0.w;
-
- data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_1.xy : data_1.xy;
- data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.zw : data_1.zw;
-
- data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.x : data_2.x;
- data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
- ? tmp_data_2.yzw : data_2.yzw;
-
- *((__global uchar4 *)(dst + dst_index + 0)) = data_0;
- *((__global uchar4 *)(dst + dst_index + 4)) = data_1;
- *((__global uchar4 *)(dst + dst_index + 8)) = data_2;
- }
-}
-__kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
- __global ushort *dst, int dst_step, int dst_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- int4 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
- ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
- ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
-
- int2 src2_data_0 = (int2)(src2.x, src2.y);
- int2 src2_data_1 = (int2)(src2.z, src2.x);
- int2 src2_data_2 = (int2)(src2.y, src2.z);
-
- uchar2 mask_data = vload2(0, mask + mask_index);
-
- ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
- ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
- ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
-
- ushort2 tmp_data_0 = convert_ushort2_sat(convert_int2_sat(src1_data_0) + src2_data_0);
- ushort2 tmp_data_1 = convert_ushort2_sat(convert_int2_sat(src1_data_1) + src2_data_1);
- ushort2 tmp_data_2 = convert_ushort2_sat(convert_int2_sat(src1_data_2) + src2_data_2);
-
- data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
- data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x;
- data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y;
-
- data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy;
-
- *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
- __global short *dst, int dst_step, int dst_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- int4 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
- short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
- short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
-
- int2 src2_data_0 = (int2)(src2.x, src2.y);
- int2 src2_data_1 = (int2)(src2.z, src2.x);
- int2 src2_data_2 = (int2)(src2.y, src2.z);
-
- uchar2 mask_data = vload2(0, mask + mask_index);
-
- short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
- short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
- short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
-
- short2 tmp_data_0 = convert_short2_sat(convert_int2_sat(src1_data_0) + src2_data_0);
- short2 tmp_data_1 = convert_short2_sat(convert_int2_sat(src1_data_1) + src2_data_1);
- short2 tmp_data_2 = convert_short2_sat(convert_int2_sat(src1_data_2) + src2_data_2);
-
- data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
- data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x;
- data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y;
-
- data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy;
-
- *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_s_add_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
- __global int *dst, int dst_step, int dst_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- int4 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
- int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
- int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
-
- int src2_data_0 = src2.x;
- int src2_data_1 = src2.y;
- int src2_data_2 = src2.z;
-
- uchar mask_data = * (mask + mask_index);
-
- int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
- int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
- int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
-
- int tmp_data_0 = convert_int_sat((long)src1_data_0 + (long)src2_data_0);
- int tmp_data_1 = convert_int_sat((long)src1_data_1 + (long)src2_data_1);
- int tmp_data_2 = convert_int_sat((long)src1_data_2 + (long)src2_data_2);
-
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_s_add_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset,
- __global float *dst, int dst_step, int dst_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- float4 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
- float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
- float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
-
- float src2_data_0 = src2.x;
- float src2_data_1 = src2.y;
- float src2_data_2 = src2.z;
-
- uchar mask_data = * (mask + mask_index);
-
- float data_0 = *((__global float *)((__global char *)dst + dst_index + 0));
- float data_1 = *((__global float *)((__global char *)dst + dst_index + 4));
- float data_2 = *((__global float *)((__global char *)dst + dst_index + 8));
-
- float tmp_data_0 = src1_data_0 + src2_data_0;
- float tmp_data_1 = src1_data_1 + src2_data_1;
- float tmp_data_2 = src1_data_2 + src2_data_2;
-
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global float *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global float *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global float *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_add_with_mask_C3_D6 (__global double *src1, int src1_step, int src1_offset,
- __global double *dst, int dst_step, int dst_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- double4 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
-
- double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
- double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
- double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
-
- double src2_data_0 = src2.x;
- double src2_data_1 = src2.y;
- double src2_data_2 = src2.z;
-
- uchar mask_data = * (mask + mask_index);
-
- double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 ));
- double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 ));
- double data_2 = *((__global double *)((__global char *)dst + dst_index + 16));
-
- double tmp_data_0 = src1_data_0 + src2_data_0;
- double tmp_data_1 = src1_data_1 + src2_data_1;
- double tmp_data_2 = src1_data_2 + src2_data_2;
-
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
- *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
- *((__global double *)((__global char *)dst + dst_index + 16))= data_2;
- }
-}
-#endif
-
__kernel void arithm_s_add_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
}
-
-__kernel void arithm_bitwise_and_with_mask_C3_D0 (
- __global uchar *src1, int src1_step, int src1_offset,
- __global uchar *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global uchar *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
- int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
- int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
-
- uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
- uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
- uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
-
- uchar4 src2_data_0 = vload4(0, src2 + src2_index + 0);
- uchar4 src2_data_1 = vload4(0, src2 + src2_index + 4);
- uchar4 src2_data_2 = vload4(0, src2 + src2_index + 8);
-
- uchar4 mask_data = vload4(0, mask + mask_index);
-
- uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
- uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
- uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
-
- uchar4 tmp_data_0 = src1_data_0 & src2_data_0;
- uchar4 tmp_data_1 = src1_data_1 & src2_data_1;
- uchar4 tmp_data_2 = src1_data_2 & src2_data_2;
-
- data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
- data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_0.w : data_0.w;
-
- data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_1.xy : data_1.xy;
- data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.zw : data_1.zw;
-
- data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.x : data_2.x;
- data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
- ? tmp_data_2.yzw : data_2.yzw;
-
- *((__global uchar4 *)(dst + dst_index + 0)) = data_0;
- *((__global uchar4 *)(dst + dst_index + 4)) = data_1;
- *((__global uchar4 *)(dst + dst_index + 8)) = data_2;
- }
-}
-
-
-__kernel void arithm_bitwise_and_with_mask_C3_D1 (
- __global char *src1, int src1_step, int src1_offset,
- __global char *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global char *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
- int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
- int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
-
- char4 src1_data_0 = vload4(0, src1 + src1_index + 0);
- char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
- char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
-
- char4 src2_data_0 = vload4(0, src2 + src2_index + 0);
- char4 src2_data_1 = vload4(0, src2 + src2_index + 4);
- char4 src2_data_2 = vload4(0, src2 + src2_index + 8);
-
- uchar4 mask_data = vload4(0, mask + mask_index);
-
- char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
- char4 data_1 = *((__global char4 *)(dst + dst_index + 4));
- char4 data_2 = *((__global char4 *)(dst + dst_index + 8));
-
- char4 tmp_data_0 = src1_data_0 & src2_data_0;
- char4 tmp_data_1 = src1_data_1 & src2_data_1;
- char4 tmp_data_2 = src1_data_2 & src2_data_2;
-
- data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
- data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_0.w : data_0.w;
-
- data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_1.xy : data_1.xy;
- data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.zw : data_1.zw;
-
- data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.x : data_2.x;
- data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
- ? tmp_data_2.yzw : data_2.yzw;
-
- *((__global char4 *)(dst + dst_index + 0)) = data_0;
- *((__global char4 *)(dst + dst_index + 4)) = data_1;
- *((__global char4 *)(dst + dst_index + 8)) = data_2;
- }
-}
-
-__kernel void arithm_bitwise_and_with_mask_C3_D2 (
- __global ushort *src1, int src1_step, int src1_offset,
- __global ushort *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global ushort *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
- int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
- ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
- ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
-
- ushort2 src2_data_0 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 0));
- ushort2 src2_data_1 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 4));
- ushort2 src2_data_2 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 8));
-
- uchar2 mask_data = vload2(0, mask + mask_index);
-
- ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
- ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
- ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
-
- ushort2 tmp_data_0 = src1_data_0 & src2_data_0 ;
- ushort2 tmp_data_1 = src1_data_1 & src2_data_1 ;
- ushort2 tmp_data_2 = src1_data_2 & src2_data_2 ;
-
- data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
- data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x;
- data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y;
-
- data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy;
-
- *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_bitwise_and_with_mask_C3_D3 (
- __global short *src1, int src1_step, int src1_offset,
- __global short *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global short *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
- int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
- short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
- short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
-
- short2 src2_data_0 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 0));
- short2 src2_data_1 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 4));
- short2 src2_data_2 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 8));
-
- uchar2 mask_data = vload2(0, mask + mask_index);
-
- short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
- short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
- short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
-
- short2 tmp_data_0 = src1_data_0 & src2_data_0 ;
- short2 tmp_data_1 = src1_data_1 & src2_data_1 ;
- short2 tmp_data_2 = src1_data_2 & src2_data_2 ;
-
- data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
- data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x;
- data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y;
-
- data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy;
-
- *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_bitwise_and_with_mask_C3_D4 (
- __global int *src1, int src1_step, int src1_offset,
- __global int *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global int *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
- int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
- int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
- int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
-
- int src2_data_0 = *((__global int *)((__global char *)src2 + src2_index + 0));
- int src2_data_1 = *((__global int *)((__global char *)src2 + src2_index + 4));
- int src2_data_2 = *((__global int *)((__global char *)src2 + src2_index + 8));
-
- uchar mask_data = * (mask + mask_index);
-
- int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
- int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
- int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
-
- int tmp_data_0 = src1_data_0 & src2_data_0 ;
- int tmp_data_1 = src1_data_1 & src2_data_1 ;
- int tmp_data_2 = src1_data_2 & src2_data_2 ;
-
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_bitwise_and_with_mask_C3_D5 (
- __global char *src1, int src1_step, int src1_offset,
- __global char *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global char *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
- int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
- char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
- char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
-
- char4 src2_data_0 = *((__global char4 *)((__global char *)src2 + src2_index + 0));
- char4 src2_data_1 = *((__global char4 *)((__global char *)src2 + src2_index + 4));
- char4 src2_data_2 = *((__global char4 *)((__global char *)src2 + src2_index + 8));
-
- uchar mask_data = * (mask + mask_index);
-
- char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));
- char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4));
- char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8));
-
- char4 tmp_data_0 = src1_data_0 & src2_data_0;
- char4 tmp_data_1 = src1_data_1 & src2_data_1;
- char4 tmp_data_2 = src1_data_2 & src2_data_2;
-
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_and_with_mask_C3_D6 (
- __global char *src1, int src1_step, int src1_offset,
- __global char *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global char *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
- int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
-
- char8 src1_data_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0 ));
- char8 src1_data_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8 ));
- char8 src1_data_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16));
-
- char8 src2_data_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0 ));
- char8 src2_data_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8 ));
- char8 src2_data_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16));
-
- uchar mask_data = * (mask + mask_index);
-
- char8 data_0 = *((__global char8 *)((__global char *)dst + dst_index + 0 ));
- char8 data_1 = *((__global char8 *)((__global char *)dst + dst_index + 8 ));
- char8 data_2 = *((__global char8 *)((__global char *)dst + dst_index + 16));
-
- char8 tmp_data_0 = src1_data_0 & src2_data_0;
- char8 tmp_data_1 = src1_data_1 & src2_data_1;
- char8 tmp_data_2 = src1_data_2 & src2_data_2;
-
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
- *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
- *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
- }
-}
-#endif
-
-
__kernel void arithm_bitwise_and_with_mask_C4_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
}
}
#endif
-__kernel void arithm_s_bitwise_and_C3_D0 (
- __global uchar *src1, int src1_step, int src1_offset,
- __global uchar *dst, int dst_step, int dst_offset,
- uchar4 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
- int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
-
- uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
- uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
- uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
-
- uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
- uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
- uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
-
- uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
- uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
- uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
-
- uchar4 tmp_data_0 = src1_data_0 & src2_data_0;
- uchar4 tmp_data_1 = src1_data_1 & src2_data_1;
- uchar4 tmp_data_2 = src1_data_2 & src2_data_2;
-
- data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
- data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_0.w : data_0.w;
-
- data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_1.xy : data_1.xy;
- data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.zw : data_1.zw;
-
- data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.x : data_2.x;
- data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
- ? tmp_data_2.yzw : data_2.yzw;
-
- *((__global uchar4 *)(dst + dst_index + 0)) = data_0;
- *((__global uchar4 *)(dst + dst_index + 4)) = data_1;
- *((__global uchar4 *)(dst + dst_index + 8)) = data_2;
- }
-}
-
-
-__kernel void arithm_s_bitwise_and_C3_D1 (
- __global char *src1, int src1_step, int src1_offset,
- __global char *dst, int dst_step, int dst_offset,
- char4 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
- int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
-
- char4 src1_data_0 = vload4(0, src1 + src1_index + 0);
- char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
- char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
-
- char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
- char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
- char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
-
- char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
- char4 data_1 = *((__global char4 *)(dst + dst_index + 4));
- char4 data_2 = *((__global char4 *)(dst + dst_index + 8));
-
- char4 tmp_data_0 = convert_char4_sat(convert_uchar4_sat(src1_data_0) & convert_uchar4_sat(src2_data_0));
- char4 tmp_data_1 = convert_char4_sat(convert_uchar4_sat(src1_data_1) & convert_uchar4_sat(src2_data_1));
- char4 tmp_data_2 = convert_char4_sat(convert_uchar4_sat(src1_data_2) & convert_uchar4_sat(src2_data_2));
-
- data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
- data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_0.w : data_0.w;
-
- data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_1.xy : data_1.xy;
- data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.zw : data_1.zw;
-
- data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.x : data_2.x;
- data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
- ? tmp_data_2.yzw : data_2.yzw;
-
- *((__global char4 *)(dst + dst_index + 0)) = data_0;
- *((__global char4 *)(dst + dst_index + 4)) = data_1;
- *((__global char4 *)(dst + dst_index + 8)) = data_2;
- }
-}
-
-__kernel void arithm_s_bitwise_and_C3_D2 (
- __global ushort *src1, int src1_step, int src1_offset,
- __global ushort *dst, int dst_step, int dst_offset,
- ushort4 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
- ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
- ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
-
- ushort2 src2_data_0 = (ushort2)(src2.x, src2.y);
- ushort2 src2_data_1 = (ushort2)(src2.z, src2.x);
- ushort2 src2_data_2 = (ushort2)(src2.y, src2.z);
-
- ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
- ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
- ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
-
- ushort2 tmp_data_0 = src1_data_0 & src2_data_0;
- ushort2 tmp_data_1 = src1_data_1 & src2_data_1;
- ushort2 tmp_data_2 = src1_data_2 & src2_data_2;
-
- data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
- data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x;
- data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y;
-
- data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy;
-
- *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_s_bitwise_and_C3_D3 (
- __global short *src1, int src1_step, int src1_offset,
- __global short *dst, int dst_step, int dst_offset,
- short4 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
- short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
- short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
-
- short2 src2_data_0 = (short2)(src2.x, src2.y);
- short2 src2_data_1 = (short2)(src2.z, src2.x);
- short2 src2_data_2 = (short2)(src2.y, src2.z);
-
- short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
- short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
- short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
-
- short2 tmp_data_0 = src1_data_0 & src2_data_0;
- short2 tmp_data_1 = src1_data_1 & src2_data_1;
- short2 tmp_data_2 = src1_data_2 & src2_data_2;
-
- data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
- data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x;
- data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y;
-
- data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy;
-
- *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_s_bitwise_and_C3_D4 (
- __global int *src1, int src1_step, int src1_offset,
- __global int *dst, int dst_step, int dst_offset,
- int4 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
- int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
- int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
-
- int src2_data_0 = src2.x;
- int src2_data_1 = src2.y;
- int src2_data_2 = src2.z;
-
- int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
- int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
- int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
-
- int tmp_data_0 = src1_data_0 & src2_data_0;
- int tmp_data_1 = src1_data_1 & src2_data_1;
- int tmp_data_2 = src1_data_2 & src2_data_2;
-
- *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
- *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
- *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
- }
-}
-__kernel void arithm_s_bitwise_and_C3_D5 (
- __global char *src1, int src1_step, int src1_offset,
- __global char *dst, int dst_step, int dst_offset,
- char16 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
- char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
- char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
-
- char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
- char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
- char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
-
- char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));
- char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4));
- char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8));
-
- char4 tmp_data_0 = src1_data_0 & src2_data_0;
- char4 tmp_data_1 = src1_data_1 & src2_data_1;
- char4 tmp_data_2 = src1_data_2 & src2_data_2;
-
- *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
- *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
- *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
- }
-}
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_and_C3_D6 (
- __global short *src1, int src1_step, int src1_offset,
- __global short *dst, int dst_step, int dst_offset,
- short16 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
-
- short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
- short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
- short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
-
- short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
- short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
- short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
-
- short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 ));
- short4 data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8 ));
- short4 data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16));
-
- short4 tmp_data_0 = src1_data_0 & src2_data_0;
- short4 tmp_data_1 = src1_data_1 & src2_data_1;
- short4 tmp_data_2 = src1_data_2 & src2_data_2;
-
- *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
- *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
- *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
- }
-}
-#endif
__kernel void arithm_s_bitwise_and_C4_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
#endif
-__kernel void arithm_bitwise_or_with_mask_C3_D0 (
- __global uchar *src1, int src1_step, int src1_offset,
- __global uchar *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global uchar *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
- int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
- int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
-
- uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
- uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
- uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
-
- uchar4 src2_data_0 = vload4(0, src2 + src2_index + 0);
- uchar4 src2_data_1 = vload4(0, src2 + src2_index + 4);
- uchar4 src2_data_2 = vload4(0, src2 + src2_index + 8);
-
- uchar4 mask_data = vload4(0, mask + mask_index);
-
- uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
- uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
- uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
-
- uchar4 tmp_data_0 = src1_data_0 | src2_data_0;
- uchar4 tmp_data_1 = src1_data_1 | src2_data_1;
- uchar4 tmp_data_2 = src1_data_2 | src2_data_2;
-
- data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
- data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_0.w : data_0.w;
-
- data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_1.xy : data_1.xy;
- data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.zw : data_1.zw;
-
- data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.x : data_2.x;
- data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
- ? tmp_data_2.yzw : data_2.yzw;
-
- *((__global uchar4 *)(dst + dst_index + 0)) = data_0;
- *((__global uchar4 *)(dst + dst_index + 4)) = data_1;
- *((__global uchar4 *)(dst + dst_index + 8)) = data_2;
- }
-}
-
-
-__kernel void arithm_bitwise_or_with_mask_C3_D1 (
- __global char *src1, int src1_step, int src1_offset,
- __global char *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global char *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
- int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
- int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
-
- char4 src1_data_0 = vload4(0, src1 + src1_index + 0);
- char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
- char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
-
- char4 src2_data_0 = vload4(0, src2 + src2_index + 0);
- char4 src2_data_1 = vload4(0, src2 + src2_index + 4);
- char4 src2_data_2 = vload4(0, src2 + src2_index + 8);
-
- uchar4 mask_data = vload4(0, mask + mask_index);
-
- char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
- char4 data_1 = *((__global char4 *)(dst + dst_index + 4));
- char4 data_2 = *((__global char4 *)(dst + dst_index + 8));
-
- char4 tmp_data_0 = src1_data_0 | src2_data_0;
- char4 tmp_data_1 = src1_data_1 | src2_data_1;
- char4 tmp_data_2 = src1_data_2 | src2_data_2;
-
- data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
- data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_0.w : data_0.w;
-
- data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_1.xy : data_1.xy;
- data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.zw : data_1.zw;
-
- data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.x : data_2.x;
- data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
- ? tmp_data_2.yzw : data_2.yzw;
-
- *((__global char4 *)(dst + dst_index + 0)) = data_0;
- *((__global char4 *)(dst + dst_index + 4)) = data_1;
- *((__global char4 *)(dst + dst_index + 8)) = data_2;
- }
-}
-
-__kernel void arithm_bitwise_or_with_mask_C3_D2 (
- __global ushort *src1, int src1_step, int src1_offset,
- __global ushort *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global ushort *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
- int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
- ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
- ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
-
- ushort2 src2_data_0 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 0));
- ushort2 src2_data_1 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 4));
- ushort2 src2_data_2 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 8));
-
- uchar2 mask_data = vload2(0, mask + mask_index);
-
- ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
- ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
- ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
-
- ushort2 tmp_data_0 = src1_data_0 | src2_data_0 ;
- ushort2 tmp_data_1 = src1_data_1 | src2_data_1 ;
- ushort2 tmp_data_2 = src1_data_2 | src2_data_2 ;
-
- data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
- data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x;
- data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y;
-
- data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy;
-
- *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_bitwise_or_with_mask_C3_D3 (
- __global short *src1, int src1_step, int src1_offset,
- __global short *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global short *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
- int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
- short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
- short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
-
- short2 src2_data_0 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 0));
- short2 src2_data_1 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 4));
- short2 src2_data_2 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 8));
-
- uchar2 mask_data = vload2(0, mask + mask_index);
-
- short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
- short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
- short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
-
- short2 tmp_data_0 = src1_data_0 | src2_data_0 ;
- short2 tmp_data_1 = src1_data_1 | src2_data_1 ;
- short2 tmp_data_2 = src1_data_2 | src2_data_2 ;
-
- data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
- data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x;
- data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y;
-
- data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy;
-
- *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_bitwise_or_with_mask_C3_D4 (
- __global int *src1, int src1_step, int src1_offset,
- __global int *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global int *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
- int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
- int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
- int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
-
- int src2_data_0 = *((__global int *)((__global char *)src2 + src2_index + 0));
- int src2_data_1 = *((__global int *)((__global char *)src2 + src2_index + 4));
- int src2_data_2 = *((__global int *)((__global char *)src2 + src2_index + 8));
-
- uchar mask_data = * (mask + mask_index);
-
- int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
- int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
- int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
-
- int tmp_data_0 = src1_data_0 | src2_data_0 ;
- int tmp_data_1 = src1_data_1 | src2_data_1 ;
- int tmp_data_2 = src1_data_2 | src2_data_2 ;
-
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_bitwise_or_with_mask_C3_D5 (
- __global char *src1, int src1_step, int src1_offset,
- __global char *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global char *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
- int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
- char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
- char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
-
- char4 src2_data_0 = *((__global char4 *)((__global char *)src2 + src2_index + 0));
- char4 src2_data_1 = *((__global char4 *)((__global char *)src2 + src2_index + 4));
- char4 src2_data_2 = *((__global char4 *)((__global char *)src2 + src2_index + 8));
-
- uchar mask_data = * (mask + mask_index);
-
- char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));
- char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4));
- char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8));
-
- char4 tmp_data_0 = src1_data_0 | src2_data_0;
- char4 tmp_data_1 = src1_data_1 | src2_data_1;
- char4 tmp_data_2 = src1_data_2 | src2_data_2;
-
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_or_with_mask_C3_D6 (
- __global char *src1, int src1_step, int src1_offset,
- __global char *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global char *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
- int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
-
- char8 src1_data_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0 ));
- char8 src1_data_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8 ));
- char8 src1_data_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16));
-
- char8 src2_data_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0 ));
- char8 src2_data_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8 ));
- char8 src2_data_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16));
-
- uchar mask_data = * (mask + mask_index);
-
- char8 data_0 = *((__global char8 *)((__global char *)dst + dst_index + 0 ));
- char8 data_1 = *((__global char8 *)((__global char *)dst + dst_index + 8 ));
- char8 data_2 = *((__global char8 *)((__global char *)dst + dst_index + 16));
-
- char8 tmp_data_0 = src1_data_0 | src2_data_0;
- char8 tmp_data_1 = src1_data_1 | src2_data_1;
- char8 tmp_data_2 = src1_data_2 | src2_data_2;
-
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
- *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
- *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
- }
-}
-#endif
-
-
__kernel void arithm_bitwise_or_with_mask_C4_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
}
}
#endif
-__kernel void arithm_s_bitwise_or_C3_D0 (
- __global uchar *src1, int src1_step, int src1_offset,
- __global uchar *dst, int dst_step, int dst_offset,
- uchar4 src2, int rows, int cols, int dst_step1)
-
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
- int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
-
- uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
- uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
- uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
-
- uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
- uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
- uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
-
- uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
- uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
- uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
-
- uchar4 tmp_data_0 = src1_data_0 | src2_data_0 ;
- uchar4 tmp_data_1 = src1_data_1 | src2_data_1 ;
- uchar4 tmp_data_2 = src1_data_2 | src2_data_2 ;
-
- data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
- data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_0.w : data_0.w;
-
- data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_1.xy : data_1.xy;
- data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.zw : data_1.zw;
-
- data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.x : data_2.x;
- data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
- ? tmp_data_2.yzw : data_2.yzw;
-
- *((__global uchar4 *)(dst + dst_index + 0)) = data_0;
- *((__global uchar4 *)(dst + dst_index + 4)) = data_1;
- *((__global uchar4 *)(dst + dst_index + 8)) = data_2;
- }
-}
-
-
-__kernel void arithm_s_bitwise_or_C3_D1 (
- __global char *src1, int src1_step, int src1_offset,
- __global char *dst, int dst_step, int dst_offset,
- char4 src2, int rows, int cols, int dst_step1)
-
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
- int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
-
- char4 src1_data_0 = vload4(0, src1 + src1_index + 0);
- char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
- char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
-
- char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
- char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
- char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
-
- char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
- char4 data_1 = *((__global char4 *)(dst + dst_index + 4));
- char4 data_2 = *((__global char4 *)(dst + dst_index + 8));
-
- char4 tmp_data_0 = src1_data_0 | src2_data_0;
- char4 tmp_data_1 = src1_data_1 | src2_data_1;
- char4 tmp_data_2 = src1_data_2 | src2_data_2;
-
- data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
- data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_0.w : data_0.w;
-
- data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_1.xy : data_1.xy;
- data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.zw : data_1.zw;
-
- data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.x : data_2.x;
- data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
- ? tmp_data_2.yzw : data_2.yzw;
-
- *((__global char4 *)(dst + dst_index + 0)) = data_0;
- *((__global char4 *)(dst + dst_index + 4)) = data_1;
- *((__global char4 *)(dst + dst_index + 8)) = data_2;
- }
-}
-
-__kernel void arithm_s_bitwise_or_C3_D2 (
- __global ushort *src1, int src1_step, int src1_offset,
- __global ushort *dst, int dst_step, int dst_offset,
- ushort4 src2, int rows, int cols, int dst_step1)
-
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
- ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
- ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
-
- ushort2 src2_data_0 = (ushort2)(src2.x, src2.y);
- ushort2 src2_data_1 = (ushort2)(src2.z, src2.x);
- ushort2 src2_data_2 = (ushort2)(src2.y, src2.z);
-
- ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
- ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
- ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
-
- ushort2 tmp_data_0 = src1_data_0 | src2_data_0 ;
- ushort2 tmp_data_1 = src1_data_1 | src2_data_1 ;
- ushort2 tmp_data_2 = src1_data_2 | src2_data_2 ;
-
- data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
- data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x;
- data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y;
-
- data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy;
-
- *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_s_bitwise_or_C3_D3 (
- __global short *src1, int src1_step, int src1_offset,
- __global short *dst, int dst_step, int dst_offset,
- short4 src2, int rows, int cols, int dst_step1)
-
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
- short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
- short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
-
- short2 src2_data_0 = (short2)(src2.x, src2.y);
- short2 src2_data_1 = (short2)(src2.z, src2.x);
- short2 src2_data_2 = (short2)(src2.y, src2.z);
-
- short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
- short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
- short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
-
- short2 tmp_data_0 = src1_data_0 | src2_data_0 ;
- short2 tmp_data_1 = src1_data_1 | src2_data_1 ;
- short2 tmp_data_2 = src1_data_2 | src2_data_2 ;
-
- data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
- data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x;
- data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y;
-
- data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy;
-
- *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_s_bitwise_or_C3_D4 (
- __global int *src1, int src1_step, int src1_offset,
- __global int *dst, int dst_step, int dst_offset,
- int4 src2, int rows, int cols, int dst_step1)
-
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
- int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
- int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
-
- int src2_data_0 = src2.x;
- int src2_data_1 = src2.y;
- int src2_data_2 = src2.z;
-
- int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
- int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
- int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
-
- int tmp_data_0 = src1_data_0 | src2_data_0;
- int tmp_data_1 = src1_data_1 | src2_data_1;
- int tmp_data_2 = src1_data_2 | src2_data_2;
-
- *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
- *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
- *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
- }
-}
-__kernel void arithm_s_bitwise_or_C3_D5 (
- __global char *src1, int src1_step, int src1_offset,
- __global char *dst, int dst_step, int dst_offset,
- char16 src2, int rows, int cols, int dst_step1)
-
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
-
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
- char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
- char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
-
- char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
- char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
- char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
-
- char4 tmp_data_0 = src1_data_0 | src2_data_0;
- char4 tmp_data_1 = src1_data_1 | src2_data_1;
- char4 tmp_data_2 = src1_data_2 | src2_data_2;
-
- *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
- *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
- *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
- }
-}
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_or_C3_D6 (
- __global short *src1, int src1_step, int src1_offset,
- __global short *dst, int dst_step, int dst_offset,
- short16 src2, int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
-
- short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
- short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
- short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
-
- short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
- short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
- short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
-
- short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 ));
- short4 data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8 ));
- short4 data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16));
-
- short4 tmp_data_0 = src1_data_0 | src2_data_0;
- short4 tmp_data_1 = src1_data_1 | src2_data_1;
- short4 tmp_data_2 = src1_data_2 | src2_data_2;
-
- *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
- *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
- *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
- }
-}
-#endif
__kernel void arithm_s_bitwise_or_C4_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
}
}
#endif
-__kernel void arithm_s_bitwise_or_with_mask_C3_D0 (
- __global uchar *src1, int src1_step, int src1_offset,
- __global uchar *dst, int dst_step, int dst_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- uchar4 src2, int rows, int cols, int dst_step1)
-
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
- int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
-
- uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
- uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
- uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
-
- uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
- uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
- uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
-
- uchar4 mask_data = vload4(0, mask + mask_index);
-
- uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
- uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
- uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
-
- uchar4 tmp_data_0 = src1_data_0 | src2_data_0;
- uchar4 tmp_data_1 = src1_data_1 | src2_data_1;
- uchar4 tmp_data_2 = src1_data_2 | src2_data_2;
-
- data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
- data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_0.w : data_0.w;
-
- data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_1.xy : data_1.xy;
- data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.zw : data_1.zw;
-
- data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.x : data_2.x;
- data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
- ? tmp_data_2.yzw : data_2.yzw;
-
- *((__global uchar4 *)(dst + dst_index + 0)) = data_0;
- *((__global uchar4 *)(dst + dst_index + 4)) = data_1;
- *((__global uchar4 *)(dst + dst_index + 8)) = data_2;
- }
-}
-
-
-__kernel void arithm_s_bitwise_or_with_mask_C3_D1 (
- __global char *src1, int src1_step, int src1_offset,
- __global char *dst, int dst_step, int dst_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- char4 src2, int rows, int cols, int dst_step1)
-
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
- int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
-
- char4 src1_data_0 = vload4(0, src1 + src1_index + 0);
- char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
- char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
-
- char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
- char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
- char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
-
- uchar4 mask_data = vload4(0, mask + mask_index);
-
- char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
- char4 data_1 = *((__global char4 *)(dst + dst_index + 4));
- char4 data_2 = *((__global char4 *)(dst + dst_index + 8));
-
- char4 tmp_data_0 = src1_data_0 | src2_data_0;
- char4 tmp_data_1 = src1_data_1 | src2_data_1;
- char4 tmp_data_2 = src1_data_2 | src2_data_2;
-
- data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
- data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_0.w : data_0.w;
-
- data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_1.xy : data_1.xy;
- data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.zw : data_1.zw;
-
- data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.x : data_2.x;
- data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
- ? tmp_data_2.yzw : data_2.yzw;
-
- *((__global char4 *)(dst + dst_index + 0)) = data_0;
- *((__global char4 *)(dst + dst_index + 4)) = data_1;
- *((__global char4 *)(dst + dst_index + 8)) = data_2;
- }
-}
-
-__kernel void arithm_s_bitwise_or_with_mask_C3_D2 (
- __global ushort *src1, int src1_step, int src1_offset,
- __global ushort *dst, int dst_step, int dst_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- ushort4 src2, int rows, int cols, int dst_step1)
-
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
- ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
- ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
-
- ushort2 src2_data_0 = (ushort2)(src2.x, src2.y);
- ushort2 src2_data_1 = (ushort2)(src2.z, src2.x);
- ushort2 src2_data_2 = (ushort2)(src2.y, src2.z);
-
- uchar2 mask_data = vload2(0, mask + mask_index);
-
- ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
- ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
- ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
-
- ushort2 tmp_data_0 = src1_data_0 | src2_data_0;
- ushort2 tmp_data_1 = src1_data_1 | src2_data_1;
- ushort2 tmp_data_2 = src1_data_2 | src2_data_2;
-
- data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
- data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x;
- data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y;
-
- data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy;
-
- *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_s_bitwise_or_with_mask_C3_D3 (
- __global short *src1, int src1_step, int src1_offset,
- __global short *dst, int dst_step, int dst_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- short4 src2, int rows, int cols, int dst_step1)
-
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
- short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
- short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
-
- short2 src2_data_0 = (short2)(src2.x, src2.y);
- short2 src2_data_1 = (short2)(src2.z, src2.x);
- short2 src2_data_2 = (short2)(src2.y, src2.z);
-
- uchar2 mask_data = vload2(0, mask + mask_index);
-
- short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
- short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
- short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
-
- short2 tmp_data_0 = src1_data_0 | src2_data_0;
- short2 tmp_data_1 = src1_data_1 | src2_data_1;
- short2 tmp_data_2 = src1_data_2 | src2_data_2;
-
- data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
- data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x;
- data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y;
-
- data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy;
-
- *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_s_bitwise_or_with_mask_C3_D4 (
- __global int *src1, int src1_step, int src1_offset,
- __global int *dst, int dst_step, int dst_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- int4 src2, int rows, int cols, int dst_step1)
-
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
- int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
- int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
-
- int src2_data_0 = src2.x;
- int src2_data_1 = src2.y;
- int src2_data_2 = src2.z;
-
- uchar mask_data = * (mask + mask_index);
-
- int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
- int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
- int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
-
- int tmp_data_0 = src1_data_0 | src2_data_0;
- int tmp_data_1 = src1_data_1 | src2_data_1;
- int tmp_data_2 = src1_data_2 | src2_data_2;
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_s_bitwise_or_with_mask_C3_D5 (
- __global char *src1, int src1_step, int src1_offset,
- __global char *dst, int dst_step, int dst_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- char16 src2, int rows, int cols, int dst_step1)
-
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
- char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
- char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
-
- char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
- char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
- char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
-
- uchar mask_data = * (mask + mask_index);
-
- char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));
- char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4));
- char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8));
-
- char4 tmp_data_0 = src1_data_0 | src2_data_0;
- char4 tmp_data_1 = src1_data_1 | src2_data_1;
- char4 tmp_data_2 = src1_data_2 | src2_data_2;
-
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
-
- }
-}
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_or_with_mask_C3_D6 (
- __global short *src1, int src1_step, int src1_offset,
- __global short *dst, int dst_step, int dst_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- short16 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
-
- short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
- short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
- short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
-
- short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
- short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
- short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
-
- uchar mask_data = * (mask + mask_index);
-
- short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 ));
- short4 data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8 ));
- short4 data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16));
-
- short4 tmp_data_0 = src1_data_0 | src2_data_0;
- short4 tmp_data_1 = src1_data_1 | src2_data_1;
- short4 tmp_data_2 = src1_data_2 | src2_data_2;
-
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0;
- *((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1;
- *((__global short4 *)((__global char *)dst + dst_index + 16))= data_2;
- }
-}
-#endif
__kernel void arithm_s_bitwise_or_with_mask_C4_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
}
#endif
-
-__kernel void arithm_bitwise_xor_with_mask_C3_D0 (
- __global uchar *src1, int src1_step, int src1_offset,
- __global uchar *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global uchar *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
- int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
- int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
-
- uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
- uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
- uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
-
- uchar4 src2_data_0 = vload4(0, src2 + src2_index + 0);
- uchar4 src2_data_1 = vload4(0, src2 + src2_index + 4);
- uchar4 src2_data_2 = vload4(0, src2 + src2_index + 8);
-
- uchar4 mask_data = vload4(0, mask + mask_index);
-
- uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
- uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
- uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
-
- uchar4 tmp_data_0 = src1_data_0 ^ src2_data_0;
- uchar4 tmp_data_1 = src1_data_1 ^ src2_data_1;
- uchar4 tmp_data_2 = src1_data_2 ^ src2_data_2;
-
- data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
- data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_0.w : data_0.w;
-
- data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_1.xy : data_1.xy;
- data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.zw : data_1.zw;
-
- data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.x : data_2.x;
- data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
- ? tmp_data_2.yzw : data_2.yzw;
-
- *((__global uchar4 *)(dst + dst_index + 0)) = data_0;
- *((__global uchar4 *)(dst + dst_index + 4)) = data_1;
- *((__global uchar4 *)(dst + dst_index + 8)) = data_2;
- }
-}
-
-
-__kernel void arithm_bitwise_xor_with_mask_C3_D1 (
- __global char *src1, int src1_step, int src1_offset,
- __global char *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global char *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
- int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
- int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
-
- char4 src1_data_0 = vload4(0, src1 + src1_index + 0);
- char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
- char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
-
- char4 src2_data_0 = vload4(0, src2 + src2_index + 0);
- char4 src2_data_1 = vload4(0, src2 + src2_index + 4);
- char4 src2_data_2 = vload4(0, src2 + src2_index + 8);
-
- uchar4 mask_data = vload4(0, mask + mask_index);
-
- char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
- char4 data_1 = *((__global char4 *)(dst + dst_index + 4));
- char4 data_2 = *((__global char4 *)(dst + dst_index + 8));
-
- char4 tmp_data_0 = src1_data_0 ^ src2_data_0;
- char4 tmp_data_1 = src1_data_1 ^ src2_data_1;
- char4 tmp_data_2 = src1_data_2 ^ src2_data_2;
-
- data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
- data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_0.w : data_0.w;
-
- data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_1.xy : data_1.xy;
- data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.zw : data_1.zw;
-
- data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.x : data_2.x;
- data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
- ? tmp_data_2.yzw : data_2.yzw;
-
- *((__global char4 *)(dst + dst_index + 0)) = data_0;
- *((__global char4 *)(dst + dst_index + 4)) = data_1;
- *((__global char4 *)(dst + dst_index + 8)) = data_2;
- }
-}
-
-__kernel void arithm_bitwise_xor_with_mask_C3_D2 (
- __global ushort *src1, int src1_step, int src1_offset,
- __global ushort *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global ushort *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
- int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
- ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
- ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
-
- ushort2 src2_data_0 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 0));
- ushort2 src2_data_1 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 4));
- ushort2 src2_data_2 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 8));
-
- uchar2 mask_data = vload2(0, mask + mask_index);
-
- ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
- ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
- ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
-
- ushort2 tmp_data_0 = src1_data_0 ^ src2_data_0 ;
- ushort2 tmp_data_1 = src1_data_1 ^ src2_data_1 ;
- ushort2 tmp_data_2 = src1_data_2 ^ src2_data_2 ;
-
- data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
- data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x;
- data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y;
-
- data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy;
-
- *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_bitwise_xor_with_mask_C3_D3 (
- __global short *src1, int src1_step, int src1_offset,
- __global short *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global short *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
- int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
- short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
- short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
-
- short2 src2_data_0 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 0));
- short2 src2_data_1 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 4));
- short2 src2_data_2 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 8));
-
- uchar2 mask_data = vload2(0, mask + mask_index);
-
- short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
- short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
- short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
-
- short2 tmp_data_0 = src1_data_0 ^ src2_data_0 ;
- short2 tmp_data_1 = src1_data_1 ^ src2_data_1 ;
- short2 tmp_data_2 = src1_data_2 ^ src2_data_2 ;
-
- data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
- data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x;
- data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y;
-
- data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy;
-
- *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_bitwise_xor_with_mask_C3_D4 (
- __global int *src1, int src1_step, int src1_offset,
- __global int *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global int *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
- int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
- int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
- int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
-
- int src2_data_0 = *((__global int *)((__global char *)src2 + src2_index + 0));
- int src2_data_1 = *((__global int *)((__global char *)src2 + src2_index + 4));
- int src2_data_2 = *((__global int *)((__global char *)src2 + src2_index + 8));
-
- uchar mask_data = * (mask + mask_index);
-
- int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
- int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
- int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
-
- int tmp_data_0 = src1_data_0 ^ src2_data_0 ;
- int tmp_data_1 = src1_data_1 ^ src2_data_1 ;
- int tmp_data_2 = src1_data_2 ^ src2_data_2 ;
-
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_bitwise_xor_with_mask_C3_D5 (
- __global char *src1, int src1_step, int src1_offset,
- __global char *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global char *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
- int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
- char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
- char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
-
- char4 src2_data_0 = *((__global char4 *)((__global char *)src2 + src2_index + 0));
- char4 src2_data_1 = *((__global char4 *)((__global char *)src2 + src2_index + 4));
- char4 src2_data_2 = *((__global char4 *)((__global char *)src2 + src2_index + 8));
-
- uchar mask_data = * (mask + mask_index);
-
- char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));
- char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4));
- char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8));
-
- char4 tmp_data_0 = src1_data_0 ^ src2_data_0;
- char4 tmp_data_1 = src1_data_1 ^ src2_data_1;
- char4 tmp_data_2 = src1_data_2 ^ src2_data_2;
-
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_bitwise_xor_with_mask_C3_D6 (
- __global char *src1, int src1_step, int src1_offset,
- __global char *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global char *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
- int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
-
- char8 src1_data_0 = *((__global char8 *)((__global char *)src1 + src1_index + 0 ));
- char8 src1_data_1 = *((__global char8 *)((__global char *)src1 + src1_index + 8 ));
- char8 src1_data_2 = *((__global char8 *)((__global char *)src1 + src1_index + 16));
-
- char8 src2_data_0 = *((__global char8 *)((__global char *)src2 + src2_index + 0 ));
- char8 src2_data_1 = *((__global char8 *)((__global char *)src2 + src2_index + 8 ));
- char8 src2_data_2 = *((__global char8 *)((__global char *)src2 + src2_index + 16));
-
- uchar mask_data = * (mask + mask_index);
-
- char8 data_0 = *((__global char8 *)((__global char *)dst + dst_index + 0 ));
- char8 data_1 = *((__global char8 *)((__global char *)dst + dst_index + 8 ));
- char8 data_2 = *((__global char8 *)((__global char *)dst + dst_index + 16));
-
- char8 tmp_data_0 = src1_data_0 ^ src2_data_0;
- char8 tmp_data_1 = src1_data_1 ^ src2_data_1;
- char8 tmp_data_2 = src1_data_2 ^ src2_data_2;
-
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
- *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
- *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
- }
-}
-#endif
-
-
__kernel void arithm_bitwise_xor_with_mask_C4_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
}
}
#endif
-__kernel void arithm_s_bitwise_xor_C3_D0 (
- __global uchar *src1, int src1_step, int src1_offset,
- __global uchar *dst, int dst_step, int dst_offset,
- uchar4 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
- int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
-
- uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
- uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
- uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
-
- uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
- uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
- uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
-
- uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
- uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
- uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
-
- uchar4 tmp_data_0 = src1_data_0 ^ src2_data_0;
- uchar4 tmp_data_1 = src1_data_1 ^ src2_data_1;
- uchar4 tmp_data_2 = src1_data_2 ^ src2_data_2;
-
- data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
- data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_0.w : data_0.w;
-
- data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_1.xy : data_1.xy;
- data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.zw : data_1.zw;
-
- data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.x : data_2.x;
- data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
- ? tmp_data_2.yzw : data_2.yzw;
-
- *((__global uchar4 *)(dst + dst_index + 0)) = data_0;
- *((__global uchar4 *)(dst + dst_index + 4)) = data_1;
- *((__global uchar4 *)(dst + dst_index + 8)) = data_2;
- }
-}
-
-
-__kernel void arithm_s_bitwise_xor_C3_D1 (
- __global char *src1, int src1_step, int src1_offset,
- __global char *dst, int dst_step, int dst_offset,
- char4 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
- int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
-
- char4 src1_data_0 = vload4(0, src1 + src1_index + 0);
- char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
- char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
-
- char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
- char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
- char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
-
- char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
- char4 data_1 = *((__global char4 *)(dst + dst_index + 4));
- char4 data_2 = *((__global char4 *)(dst + dst_index + 8));
-
- char4 tmp_data_0 = src1_data_0 ^ src2_data_0;
- char4 tmp_data_1 = src1_data_1 ^ src2_data_1;
- char4 tmp_data_2 = src1_data_2 ^ src2_data_2;
-
- data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
- data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_0.w : data_0.w;
-
- data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_1.xy : data_1.xy;
- data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.zw : data_1.zw;
-
- data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.x : data_2.x;
- data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
- ? tmp_data_2.yzw : data_2.yzw;
-
- *((__global char4 *)(dst + dst_index + 0)) = data_0;
- *((__global char4 *)(dst + dst_index + 4)) = data_1;
- *((__global char4 *)(dst + dst_index + 8)) = data_2;
- }
-}
-
-__kernel void arithm_s_bitwise_xor_C3_D2 (
- __global ushort *src1, int src1_step, int src1_offset,
- __global ushort *dst, int dst_step, int dst_offset,
- ushort4 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
- ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
- ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
-
- ushort2 src2_data_0 = (ushort2)(src2.x, src2.y);
- ushort2 src2_data_1 = (ushort2)(src2.z, src2.x);
- ushort2 src2_data_2 = (ushort2)(src2.y, src2.z);
-
- ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
- ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
- ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
-
- ushort2 tmp_data_0 = src1_data_0 ^ src2_data_0;
- ushort2 tmp_data_1 = src1_data_1 ^ src2_data_1;
- ushort2 tmp_data_2 = src1_data_2 ^ src2_data_2;
-
- data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
- data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x;
- data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y;
-
- data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy;
-
- *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_s_bitwise_xor_C3_D3 (
- __global short *src1, int src1_step, int src1_offset,
- __global short *dst, int dst_step, int dst_offset,
- short4 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
- short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
- short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
-
- short2 src2_data_0 = (short2)(src2.x, src2.y);
- short2 src2_data_1 = (short2)(src2.z, src2.x);
- short2 src2_data_2 = (short2)(src2.y, src2.z);
-
- short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
- short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
- short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
-
- short2 tmp_data_0 = src1_data_0 ^ src2_data_0;
- short2 tmp_data_1 = src1_data_1 ^ src2_data_1;
- short2 tmp_data_2 = src1_data_2 ^ src2_data_2;
-
- data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
- data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x;
- data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y;
-
- data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy;
-
- *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_s_bitwise_xor_C3_D4 (
- __global int *src1, int src1_step, int src1_offset,
- __global int *dst, int dst_step, int dst_offset,
- int4 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
- int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
- int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
-
- int src2_data_0 = src2.x;
- int src2_data_1 = src2.y;
- int src2_data_2 = src2.z;
-
- int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
- int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
- int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
-
- int tmp_data_0 = src1_data_0 ^ src2_data_0;
- int tmp_data_1 = src1_data_1 ^ src2_data_1;
- int tmp_data_2 = src1_data_2 ^ src2_data_2;
-
- *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
- *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
- *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
- }
-}
-__kernel void arithm_s_bitwise_xor_C3_D5 (
- __global char *src1, int src1_step, int src1_offset,
- __global char *dst, int dst_step, int dst_offset,
- char16 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
- char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
- char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
-
- char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
- char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
- char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
-
- char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));
- char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4));
- char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8));
-
- char4 tmp_data_0 = src1_data_0 ^ src2_data_0;
- char4 tmp_data_1 = src1_data_1 ^ src2_data_1;
- char4 tmp_data_2 = src1_data_2 ^ src2_data_2;
-
- *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
- *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
- *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
- }
-}
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_xor_C3_D6 (
- __global short *src1, int src1_step, int src1_offset,
- __global short *dst, int dst_step, int dst_offset,
- short16 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
-
- short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
- short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
- short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
-
- short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
- short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
- short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
-
- short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 ));
- short4 data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8 ));
- short4 data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16));
-
- short4 tmp_data_0 = src1_data_0 ^ src2_data_0;
- short4 tmp_data_1 = src1_data_1 ^ src2_data_1;
- short4 tmp_data_2 = src1_data_2 ^ src2_data_2;
-
- *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
- *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
- *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
- }
-}
-#endif
__kernel void arithm_s_bitwise_xor_C4_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
}
}
#endif
-__kernel void arithm_s_bitwise_xor_with_mask_C3_D0 (
- __global uchar *src1, int src1_step, int src1_offset,
- __global uchar *dst, int dst_step, int dst_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- uchar4 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
- int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
-
- uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
- uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
- uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
-
- uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
- uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
- uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);
-
- uchar4 mask_data = vload4(0, mask + mask_index);
-
- uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
- uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
- uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
-
- uchar4 tmp_data_0 = src1_data_0 ^ src2_data_0;
- uchar4 tmp_data_1 = src1_data_1 ^ src2_data_1;
- uchar4 tmp_data_2 = src1_data_2 ^ src2_data_2;
-
- data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
- data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_0.w : data_0.w;
-
- data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_1.xy : data_1.xy;
- data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.zw : data_1.zw;
-
- data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.x : data_2.x;
- data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
- ? tmp_data_2.yzw : data_2.yzw;
-
- *((__global uchar4 *)(dst + dst_index + 0)) = data_0;
- *((__global uchar4 *)(dst + dst_index + 4)) = data_1;
- *((__global uchar4 *)(dst + dst_index + 8)) = data_2;
- }
-}
-
-
-__kernel void arithm_s_bitwise_xor_with_mask_C3_D1 (
- __global char *src1, int src1_step, int src1_offset,
- __global char *dst, int dst_step, int dst_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- char4 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 2;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
- int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
-
- char4 src1_data_0 = vload4(0, src1 + src1_index + 0);
- char4 src1_data_1 = vload4(0, src1 + src1_index + 4);
- char4 src1_data_2 = vload4(0, src1 + src1_index + 8);
-
- char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x);
- char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y);
- char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z);
-
- uchar4 mask_data = vload4(0, mask + mask_index);
-
- char4 data_0 = *((__global char4 *)(dst + dst_index + 0));
- char4 data_1 = *((__global char4 *)(dst + dst_index + 4));
- char4 data_2 = *((__global char4 *)(dst + dst_index + 8));
-
- char4 tmp_data_0 = src1_data_0 ^ src2_data_0;
- char4 tmp_data_1 = src1_data_1 ^ src2_data_1;
- char4 tmp_data_2 = src1_data_2 ^ src2_data_2;
-
- data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
- data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_0.w : data_0.w;
-
- data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_1.xy : data_1.xy;
- data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.zw : data_1.zw;
- data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.x : data_2.x;
- data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
- ? tmp_data_2.yzw : data_2.yzw;
-
- *((__global char4 *)(dst + dst_index + 0)) = data_0;
- *((__global char4 *)(dst + dst_index + 4)) = data_1;
- *((__global char4 *)(dst + dst_index + 8)) = data_2;
- }
-}
-
-__kernel void arithm_s_bitwise_xor_with_mask_C3_D2 (
- __global ushort *src1, int src1_step, int src1_offset,
- __global ushort *dst, int dst_step, int dst_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- ushort4 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
- ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
- ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
-
- ushort2 src2_data_0 = (ushort2)(src2.x, src2.y);
- ushort2 src2_data_1 = (ushort2)(src2.z, src2.x);
- ushort2 src2_data_2 = (ushort2)(src2.y, src2.z);
-
- uchar2 mask_data = vload2(0, mask + mask_index);
-
- ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
- ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
- ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
-
- ushort2 tmp_data_0 = src1_data_0 ^ src2_data_0;
- ushort2 tmp_data_1 = src1_data_1 ^ src2_data_1;
- ushort2 tmp_data_2 = src1_data_2 ^ src2_data_2;
-
- data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
- data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x;
- data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y;
-
- data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy;
-
- *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_s_bitwise_xor_with_mask_C3_D3 (
- __global short *src1, int src1_step, int src1_offset,
- __global short *dst, int dst_step, int dst_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- short4 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1;
-
-#ifdef dst_align
-#undef dst_align
-#endif
-#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
- short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
- short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
-
- short2 src2_data_0 = (short2)(src2.x, src2.y);
- short2 src2_data_1 = (short2)(src2.z, src2.x);
- short2 src2_data_2 = (short2)(src2.y, src2.z);
-
- uchar2 mask_data = vload2(0, mask + mask_index);
-
- short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
- short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
- short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
-
- short2 tmp_data_0 = src1_data_0 ^ src2_data_0;
- short2 tmp_data_1 = src1_data_1 ^ src2_data_1;
- short2 tmp_data_2 = src1_data_2 ^ src2_data_2;
-
- data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
- data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x;
- data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y;
-
- data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy;
-
- *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_s_bitwise_xor_with_mask_C3_D4 (
- __global int *src1, int src1_step, int src1_offset,
- __global int *dst, int dst_step, int dst_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- int4 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
- int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
- int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
-
- int src2_data_0 = src2.x;
- int src2_data_1 = src2.y;
- int src2_data_2 = src2.z;
-
- uchar mask_data = * (mask + mask_index);
-
- int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
- int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
- int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
-
- int tmp_data_0 = src1_data_0 ^ src2_data_0;
- int tmp_data_1 = src1_data_1 ^ src2_data_1;
- int tmp_data_2 = src1_data_2 ^ src2_data_2;
-
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_s_bitwise_xor_with_mask_C3_D5 (
- __global char *src1, int src1_step, int src1_offset,
- __global char *dst, int dst_step, int dst_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- char16 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
- char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
- char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));
-
- char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
- char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
- char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);
-
- uchar mask_data = * (mask + mask_index);
-
- char4 data_0 = *((__global char4 *)((__global char *)dst + dst_index + 0));
- char4 data_1 = *((__global char4 *)((__global char *)dst + dst_index + 4));
- char4 data_2 = *((__global char4 *)((__global char *)dst + dst_index + 8));
-
- char4 tmp_data_0 = src1_data_0 ^ src2_data_0;
- char4 tmp_data_1 = src1_data_1 ^ src2_data_1;
- char4 tmp_data_2 = src1_data_2 ^ src2_data_2;
-
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_bitwise_xor_with_mask_C3_D6 (
- __global short *src1, int src1_step, int src1_offset,
- __global short *dst, int dst_step, int dst_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- short16 src2, int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
-
- short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
- short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
- short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));
-
- short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
- short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
- short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
-
- uchar mask_data = * (mask + mask_index);
-
- short4 data_0 = *((__global short4 *)((__global char *)dst + dst_index + 0 ));
- short4 data_1 = *((__global short4 *)((__global char *)dst + dst_index + 8 ));
- short4 data_2 = *((__global short4 *)((__global char *)dst + dst_index + 16));
-
- short4 tmp_data_0 = src1_data_0 ^ src2_data_0;
- short4 tmp_data_1 = src1_data_1 ^ src2_data_1;
- short4 tmp_data_2 = src1_data_2 ^ src2_data_2;
-
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0;
- *((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1;
- *((__global short4 *)((__global char *)dst + dst_index + 16))= data_2;
- }
-}
-#endif
__kernel void arithm_s_bitwise_xor_with_mask_C4_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
//M*/
#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
}
}
#endif
-__kernel void arithm_sub_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
- __global uchar *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global uchar *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 2;
-
- #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
- int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
- int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
-
- uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
- uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
- uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
-
- uchar4 src2_data_0 = vload4(0, src2 + src2_index + 0);
- uchar4 src2_data_1 = vload4(0, src2 + src2_index + 4);
- uchar4 src2_data_2 = vload4(0, src2 + src2_index + 8);
-
- uchar4 mask_data = vload4(0, mask + mask_index);
-
- uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
- uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
- uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
-
- uchar4 tmp_data_0 = convert_uchar4_sat(convert_short4_sat(src1_data_0) - convert_short4_sat(src2_data_0));
- uchar4 tmp_data_1 = convert_uchar4_sat(convert_short4_sat(src1_data_1) - convert_short4_sat(src2_data_1));
- uchar4 tmp_data_2 = convert_uchar4_sat(convert_short4_sat(src1_data_2) - convert_short4_sat(src2_data_2));
-
- data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
- data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_0.w : data_0.w;
-
- data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_1.xy : data_1.xy;
- data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.zw : data_1.zw;
-
- data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.x : data_2.x;
- data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
- ? tmp_data_2.yzw : data_2.yzw;
-
- *((__global uchar4 *)(dst + dst_index + 0)) = data_0;
- *((__global uchar4 *)(dst + dst_index + 4)) = data_1;
- *((__global uchar4 *)(dst + dst_index + 8)) = data_2;
- }
-}
-__kernel void arithm_sub_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
- __global ushort *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global ushort *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1;
-
- #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
- int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
- ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
- ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
-
- ushort2 src2_data_0 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 0));
- ushort2 src2_data_1 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 4));
- ushort2 src2_data_2 = vload2(0, (__global ushort *)((__global char *)src2 + src2_index + 8));
-
- uchar2 mask_data = vload2(0, mask + mask_index);
-
- ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
- ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
- ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
-
- ushort2 tmp_data_0 = convert_ushort2_sat(convert_int2_sat(src1_data_0) - convert_int2_sat(src2_data_0));
- ushort2 tmp_data_1 = convert_ushort2_sat(convert_int2_sat(src1_data_1) - convert_int2_sat(src2_data_1));
- ushort2 tmp_data_2 = convert_ushort2_sat(convert_int2_sat(src1_data_2) - convert_int2_sat(src2_data_2));
-
- data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
- data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x;
- data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y;
-
- data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy;
-
- *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_sub_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
- __global short *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global short *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1;
-
- #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
- int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
- short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
- short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
-
- short2 src2_data_0 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 0));
- short2 src2_data_1 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 4));
- short2 src2_data_2 = vload2(0, (__global short *)((__global char *)src2 + src2_index + 8));
-
- uchar2 mask_data = vload2(0, mask + mask_index);
-
- short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
- short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
- short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
-
- short2 tmp_data_0 = convert_short2_sat(convert_int2_sat(src1_data_0) - convert_int2_sat(src2_data_0));
- short2 tmp_data_1 = convert_short2_sat(convert_int2_sat(src1_data_1) - convert_int2_sat(src2_data_1));
- short2 tmp_data_2 = convert_short2_sat(convert_int2_sat(src1_data_2) - convert_int2_sat(src2_data_2));
-
- data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
- data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x;
- data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y;
-
- data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy;
-
- *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_sub_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
- __global int *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global int *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
- int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
- int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
- int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
-
- int src2_data_0 = *((__global int *)((__global char *)src2 + src2_index + 0));
- int src2_data_1 = *((__global int *)((__global char *)src2 + src2_index + 4));
- int src2_data_2 = *((__global int *)((__global char *)src2 + src2_index + 8));
-
- uchar mask_data = * (mask + mask_index);
-
- int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
- int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
- int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
-
- int tmp_data_0 = convert_int_sat((long)src1_data_0 - (long)src2_data_0);
- int tmp_data_1 = convert_int_sat((long)src1_data_1 - (long)src2_data_1);
- int tmp_data_2 = convert_int_sat((long)src1_data_2 - (long)src2_data_2);
-
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_sub_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset,
- __global float *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global float *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
- int src2_index = mad24(y, src2_step, (x * 12) + src2_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
- float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
- float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
-
- float src2_data_0 = *((__global float *)((__global char *)src2 + src2_index + 0));
- float src2_data_1 = *((__global float *)((__global char *)src2 + src2_index + 4));
- float src2_data_2 = *((__global float *)((__global char *)src2 + src2_index + 8));
-
- uchar mask_data = * (mask + mask_index);
-
- float data_0 = *((__global float *)((__global char *)dst + dst_index + 0));
- float data_1 = *((__global float *)((__global char *)dst + dst_index + 4));
- float data_2 = *((__global float *)((__global char *)dst + dst_index + 8));
-
- float tmp_data_0 = src1_data_0 - src2_data_0;
- float tmp_data_1 = src1_data_1 - src2_data_1;
- float tmp_data_2 = src1_data_2 - src2_data_2;
-
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global float *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global float *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global float *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_sub_with_mask_C3_D6 (__global double *src1, int src1_step, int src1_offset,
- __global double *src2, int src2_step, int src2_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- __global double *dst, int dst_step, int dst_offset,
- int rows, int cols, int dst_step1)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
- int src2_index = mad24(y, src2_step, (x * 24) + src2_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
-
- double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
- double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
- double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
-
- double src2_data_0 = *((__global double *)((__global char *)src2 + src2_index + 0 ));
- double src2_data_1 = *((__global double *)((__global char *)src2 + src2_index + 8 ));
- double src2_data_2 = *((__global double *)((__global char *)src2 + src2_index + 16));
-
- uchar mask_data = * (mask + mask_index);
-
- double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 ));
- double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 ));
- double data_2 = *((__global double *)((__global char *)dst + dst_index + 16));
-
- double tmp_data_0 = src1_data_0 - src2_data_0;
- double tmp_data_1 = src1_data_1 - src2_data_1;
- double tmp_data_2 = src1_data_2 - src2_data_2;
-
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
- *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
- *((__global double *)((__global char *)dst + dst_index + 16))= data_2;
- }
-}
-#endif
__kernel void arithm_sub_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
-
#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
#endif
/**************************************sub with scalar without mask**************************************/
__kernel void arithm_s_sub_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
}
}
#endif
-__kernel void arithm_s_sub_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
- __global uchar *dst, int dst_step, int dst_offset,
- int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 2;
-
- #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
- int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
-
- uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
- uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
- uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
-
- int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
- int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
- int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
-
- uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
- uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
- uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
-
- int4 tmp_0 = convert_int4_sat(src1_data_0) - src2_data_0;
- int4 tmp_1 = convert_int4_sat(src1_data_1) - src2_data_1;
- int4 tmp_2 = convert_int4_sat(src1_data_2) - src2_data_2;
-
- tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
- tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
- tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
-
- uchar4 tmp_data_0 = convert_uchar4_sat(tmp_0);
- uchar4 tmp_data_1 = convert_uchar4_sat(tmp_1);
- uchar4 tmp_data_2 = convert_uchar4_sat(tmp_2);
-
- data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
- data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_0.w : data_0.w;
-
- data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_1.xy : data_1.xy;
- data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.zw : data_1.zw;
-
- data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.x : data_2.x;
- data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
- ? tmp_data_2.yzw : data_2.yzw;
-
- *((__global uchar4 *)(dst + dst_index + 0)) = data_0;
- *((__global uchar4 *)(dst + dst_index + 4)) = data_1;
- *((__global uchar4 *)(dst + dst_index + 8)) = data_2;
- }
-}
-__kernel void arithm_s_sub_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
- __global ushort *dst, int dst_step, int dst_offset,
- int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1;
-
- #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
- ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
- ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
-
- int2 src2_data_0 = (int2)(src2.x, src2.y);
- int2 src2_data_1 = (int2)(src2.z, src2.x);
- int2 src2_data_2 = (int2)(src2.y, src2.z);
-
- ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
- ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
- ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
-
- int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0;
- int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1;
- int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2;
-
- tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
- tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
- tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
-
- ushort2 tmp_data_0 = convert_ushort2_sat(tmp_0);
- ushort2 tmp_data_1 = convert_ushort2_sat(tmp_1);
- ushort2 tmp_data_2 = convert_ushort2_sat(tmp_2);
-
- data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
- data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x;
- data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y;
-
- data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy;
-
- *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_s_sub_C3_D3 (__global short *src1, int src1_step, int src1_offset,
- __global short *dst, int dst_step, int dst_offset,
- int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1;
-
- #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
- short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
- short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
-
- int2 src2_data_0 = (int2)(src2.x, src2.y);
- int2 src2_data_1 = (int2)(src2.z, src2.x);
- int2 src2_data_2 = (int2)(src2.y, src2.z);
-
- short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
- short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
- short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
-
- int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0;
- int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1;
- int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2;
-
- tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
- tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
- tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
-
- short2 tmp_data_0 = convert_short2_sat(tmp_0);
- short2 tmp_data_1 = convert_short2_sat(tmp_1);
- short2 tmp_data_2 = convert_short2_sat(tmp_2);
-
- data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
- data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x;
- data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y;
-
- data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy;
-
- *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_s_sub_C3_D4 (__global int *src1, int src1_step, int src1_offset,
- __global int *dst, int dst_step, int dst_offset,
- int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
- int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
- int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
-
- int src2_data_0 = src2.x;
- int src2_data_1 = src2.y;
- int src2_data_2 = src2.z;
-
- int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
- int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
- int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
-
- long tmp_0 = (long)src1_data_0 - (long)src2_data_0;
- long tmp_1 = (long)src1_data_1 - (long)src2_data_1;
- long tmp_2 = (long)src1_data_2 - (long)src2_data_2;
-
- tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
- tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
- tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
-
- int tmp_data_0 = convert_int_sat(tmp_0);
- int tmp_data_1 = convert_int_sat(tmp_1);
- int tmp_data_2 = convert_int_sat(tmp_2);
-
- *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
- *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
- *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
- }
-}
-__kernel void arithm_s_sub_C3_D5 (__global float *src1, int src1_step, int src1_offset,
- __global float *dst, int dst_step, int dst_offset,
- float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
- float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
- float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
-
- float src2_data_0 = src2.x;
- float src2_data_1 = src2.y;
- float src2_data_2 = src2.z;
-
- float data_0 = *((__global float *)((__global char *)dst + dst_index + 0));
- float data_1 = *((__global float *)((__global char *)dst + dst_index + 4));
- float data_2 = *((__global float *)((__global char *)dst + dst_index + 8));
-
- float tmp_0 = src1_data_0 - src2_data_0;
- float tmp_1 = src1_data_1 - src2_data_1;
- float tmp_2 = src1_data_2 - src2_data_2;
-
- tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
- tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
- tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
-
- *((__global float *)((__global char *)dst + dst_index + 0))= tmp_0;
- *((__global float *)((__global char *)dst + dst_index + 4))= tmp_1;
- *((__global float *)((__global char *)dst + dst_index + 8))= tmp_2;
- }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_sub_C3_D6 (__global double *src1, int src1_step, int src1_offset,
- __global double *dst, int dst_step, int dst_offset,
- double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
-
- double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
- double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
- double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
-
- double src2_data_0 = src2.x;
- double src2_data_1 = src2.y;
- double src2_data_2 = src2.z;
-
- double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 ));
- double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 ));
- double data_2 = *((__global double *)((__global char *)dst + dst_index + 16));
-
- double tmp_data_0 = src1_data_0 - src2_data_0;
- double tmp_data_1 = src1_data_1 - src2_data_1;
- double tmp_data_2 = src1_data_2 - src2_data_2;
-
- tmp_data_0 = isMatSubScalar ? tmp_data_0 : -tmp_data_0;
- tmp_data_1 = isMatSubScalar ? tmp_data_1 : -tmp_data_1;
- tmp_data_2 = isMatSubScalar ? tmp_data_2 : -tmp_data_2;
-
- *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
- *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
- *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
- }
-}
-#endif
__kernel void arithm_s_sub_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
//M*/
#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
#endif
/**************************************sub with scalar with mask**************************************/
}
}
#endif
-__kernel void arithm_s_sub_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
- __global uchar *dst, int dst_step, int dst_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 2;
-
- #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
- int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3));
-
- uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0);
- uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4);
- uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8);
-
- int4 src2_data_0 = (int4)(src2.x, src2.y, src2.z, src2.x);
- int4 src2_data_1 = (int4)(src2.y, src2.z, src2.x, src2.y);
- int4 src2_data_2 = (int4)(src2.z, src2.x, src2.y, src2.z);
-
- uchar4 mask_data = vload4(0, mask + mask_index);
-
- uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0));
- uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4));
- uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8));
-
- int4 tmp_0 = convert_int4_sat(src1_data_0) - src2_data_0;
- int4 tmp_1 = convert_int4_sat(src1_data_1) - src2_data_1;
- int4 tmp_2 = convert_int4_sat(src1_data_2) - src2_data_2;
-
- tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
- tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
- tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
-
- uchar4 tmp_data_0 = convert_uchar4_sat(tmp_0);
- uchar4 tmp_data_1 = convert_uchar4_sat(tmp_1);
- uchar4 tmp_data_2 = convert_uchar4_sat(tmp_2);
-
- data_0.xyz = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz;
- data_0.w = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_0.w : data_0.w;
-
- data_1.xy = ((mask_data.y) && (dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end))
- ? tmp_data_1.xy : data_1.xy;
- data_1.zw = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.zw : data_1.zw;
-
- data_2.x = ((mask_data.z) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.x : data_2.x;
- data_2.yzw = ((mask_data.w) && (dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end))
- ? tmp_data_2.yzw : data_2.yzw;
-
- *((__global uchar4 *)(dst + dst_index + 0)) = data_0;
- *((__global uchar4 *)(dst + dst_index + 4)) = data_1;
- *((__global uchar4 *)(dst + dst_index + 8)) = data_2;
- }
-}
-__kernel void arithm_s_sub_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
- __global ushort *dst, int dst_step, int dst_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1;
-
- #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
-
- ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0));
- ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4));
- ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8));
-
- int2 src2_data_0 = (int2)(src2.x, src2.y);
- int2 src2_data_1 = (int2)(src2.z, src2.x);
- int2 src2_data_2 = (int2)(src2.y, src2.z);
-
- uchar2 mask_data = vload2(0, mask + mask_index);
-
- ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0));
- ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4));
- ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8));
-
- int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0;
- int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1;
- int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2;
-
- tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
- tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
- tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
-
- ushort2 tmp_data_0 = convert_ushort2_sat(tmp_0);
- ushort2 tmp_data_1 = convert_ushort2_sat(tmp_1);
- ushort2 tmp_data_2 = convert_ushort2_sat(tmp_2);
-
- data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
- data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x;
- data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y;
-
- data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy;
-
- *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_s_sub_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
- __global short *dst, int dst_step, int dst_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- x = x << 1;
-
- #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
- int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
- int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
-
- int dst_start = mad24(y, dst_step, dst_offset);
- int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6));
- short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0));
- short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4));
- short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8));
-
- int2 src2_data_0 = (int2)(src2.x, src2.y);
- int2 src2_data_1 = (int2)(src2.z, src2.x);
- int2 src2_data_2 = (int2)(src2.y, src2.z);
-
- uchar2 mask_data = vload2(0, mask + mask_index);
-
- short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0));
- short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4));
- short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8));
-
- int2 tmp_0 = convert_int2_sat(src1_data_0) - src2_data_0;
- int2 tmp_1 = convert_int2_sat(src1_data_1) - src2_data_1;
- int2 tmp_2 = convert_int2_sat(src1_data_2) - src2_data_2;
-
- tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
- tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
- tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
-
- short2 tmp_data_0 = convert_short2_sat(tmp_0);
- short2 tmp_data_1 = convert_short2_sat(tmp_1);
- short2 tmp_data_2 = convert_short2_sat(tmp_2);
-
- data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
-
- data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
- ? tmp_data_1.x : data_1.x;
- data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_1.y : data_1.y;
-
- data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
- ? tmp_data_2.xy : data_2.xy;
-
- *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_s_sub_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
- __global int *dst, int dst_step, int dst_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- int4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
- int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
- int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));
-
- int src2_data_0 = src2.x;
- int src2_data_1 = src2.y;
- int src2_data_2 = src2.z;
-
- uchar mask_data = * (mask + mask_index);
-
- int data_0 = *((__global int *)((__global char *)dst + dst_index + 0));
- int data_1 = *((__global int *)((__global char *)dst + dst_index + 4));
- int data_2 = *((__global int *)((__global char *)dst + dst_index + 8));
-
- long tmp_0 = (long)src1_data_0 - (long)src2_data_0;
- long tmp_1 = (long)src1_data_1 - (long)src2_data_1;
- long tmp_2 = (long)src1_data_2 - (long)src2_data_2;
-
- tmp_0 = isMatSubScalar ? tmp_0 : -tmp_0;
- tmp_1 = isMatSubScalar ? tmp_1 : -tmp_1;
- tmp_2 = isMatSubScalar ? tmp_2 : -tmp_2;
-
- int tmp_data_0 = convert_int_sat(tmp_0);
- int tmp_data_1 = convert_int_sat(tmp_1);
- int tmp_data_2 = convert_int_sat(tmp_2);
-
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-__kernel void arithm_s_sub_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset,
- __global float *dst, int dst_step, int dst_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- float4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 12));
-
- float src1_data_0 = *((__global float *)((__global char *)src1 + src1_index + 0));
- float src1_data_1 = *((__global float *)((__global char *)src1 + src1_index + 4));
- float src1_data_2 = *((__global float *)((__global char *)src1 + src1_index + 8));
-
- float src2_data_0 = src2.x;
- float src2_data_1 = src2.y;
- float src2_data_2 = src2.z;
-
- uchar mask_data = * (mask + mask_index);
-
- float data_0 = *((__global float *)((__global char *)dst + dst_index + 0));
- float data_1 = *((__global float *)((__global char *)dst + dst_index + 4));
- float data_2 = *((__global float *)((__global char *)dst + dst_index + 8));
-
- float tmp_data_0 = src1_data_0 - src2_data_0;
- float tmp_data_1 = src1_data_1 - src2_data_1;
- float tmp_data_2 = src1_data_2 - src2_data_2;
-
- tmp_data_0 = isMatSubScalar ? tmp_data_0 : -tmp_data_0;
- tmp_data_1 = isMatSubScalar ? tmp_data_1 : -tmp_data_1;
- tmp_data_2 = isMatSubScalar ? tmp_data_2 : -tmp_data_2;
-
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global float *)((__global char *)dst + dst_index + 0))= data_0;
- *((__global float *)((__global char *)dst + dst_index + 4))= data_1;
- *((__global float *)((__global char *)dst + dst_index + 8))= data_2;
- }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void arithm_s_sub_with_mask_C3_D6 (__global double *src1, int src1_step, int src1_offset,
- __global double *dst, int dst_step, int dst_offset,
- __global uchar *mask, int mask_step, int mask_offset,
- double4 src2, int rows, int cols, int dst_step1, int isMatSubScalar)
-{
-
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if (x < cols && y < rows)
- {
- int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
- int mask_index = mad24(y, mask_step, x + mask_offset);
- int dst_index = mad24(y, dst_step, dst_offset + (x * 24));
-
- double src1_data_0 = *((__global double *)((__global char *)src1 + src1_index + 0 ));
- double src1_data_1 = *((__global double *)((__global char *)src1 + src1_index + 8 ));
- double src1_data_2 = *((__global double *)((__global char *)src1 + src1_index + 16));
-
- double src2_data_0 = src2.x;
- double src2_data_1 = src2.y;
- double src2_data_2 = src2.z;
-
- uchar mask_data = * (mask + mask_index);
-
- double data_0 = *((__global double *)((__global char *)dst + dst_index + 0 ));
- double data_1 = *((__global double *)((__global char *)dst + dst_index + 8 ));
- double data_2 = *((__global double *)((__global char *)dst + dst_index + 16));
-
- double tmp_data_0 = src1_data_0 - src2_data_0;
- double tmp_data_1 = src1_data_1 - src2_data_1;
- double tmp_data_2 = src1_data_2 - src2_data_2;
-
- tmp_data_0 = isMatSubScalar ? tmp_data_0 : -tmp_data_0;
- tmp_data_1 = isMatSubScalar ? tmp_data_1 : -tmp_data_1;
- tmp_data_2 = isMatSubScalar ? tmp_data_2 : -tmp_data_2;
-
- data_0 = mask_data ? tmp_data_0 : data_0;
- data_1 = mask_data ? tmp_data_1 : data_1;
- data_2 = mask_data ? tmp_data_2 : data_2;
-
- *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
- *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
- *((__global double *)((__global char *)dst + dst_index + 16))= data_2;
- }
-}
-#endif
__kernel void arithm_s_sub_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4),
Values(false)));
+INSTANTIATE_TEST_CASE_P(Arithm, Sub, Combine(
+ Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4),
+ Values(false)));
+
INSTANTIATE_TEST_CASE_P(Arithm, Mul, Combine(
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4),
Values(false))); // Values(false) is the reserved parameter
INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_and, Combine(
- Values(CV_8UC1, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
+ Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
//Values(false) is the reserved parameter
INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_or, Combine(
- Values(CV_8UC1, CV_8UC3, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
+ Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
//Values(false) is the reserved parameter
INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_xor, Combine(
- Values(CV_8UC1, CV_8UC3, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
+ Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
//Values(false) is the reserved parameter
INSTANTIATE_TEST_CASE_P(Arithm, Bitwise_not, Combine(
- Values(CV_8UC1, CV_8UC3, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
+ Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32SC1, CV_32FC1, CV_32FC3, CV_32FC4), Values(false)));
//Values(false) is the reserved parameter
INSTANTIATE_TEST_CASE_P(Arithm, Compare, Combine(Values(CV_8UC1, CV_32SC1, CV_32FC1), Values(false)));