// License Agreement
// For Open Source Computer Vision Library
//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
-// @Authors
-// Jia Haipeng, jiahaipeng95@gmail.com
-//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
-///////////////////////////////////////////////////////////////////////////////////////////////
-//////////////////////////////////optimized code using vector ////////////////////////////////
-////////////vector fuction name format: split_vector_C(channels number)_D(data type depth)//////
-////////////////////////////////////////////////////////////////////////////////////////////////
-__kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int src_offset,
- __global uchar *mat_dst0, int dst0_step, int dst0_offset,
- __global uchar *mat_dst1, int dst1_step, int dst1_offset,
- __global uchar *mat_dst2, int dst2_step, int dst2_offset,
- __global uchar *mat_dst3, int dst3_step, int dst3_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- x = x << 2;
-
- int src_idx = mad24(y, src_step, src_offset + (x << 2));
-
- int dst0_start = mad24(y, dst0_step, dst0_offset);
- int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
- int dst0_idx = mad24(y, dst0_step, dst0_offset + x) & (int)0xfffffffc;
-
- int dst1_start = mad24(y, dst1_step, dst1_offset);
- int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
- int dst1_idx = mad24(y, dst1_step, dst1_offset + x) & (int)0xfffffffc;
-
- int dst2_start = mad24(y, dst2_step, dst2_offset);
- int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
- int dst2_idx = mad24(y, dst2_step, dst2_offset + x) & (int)0xfffffffc;
-
- int dst3_start = mad24(y, dst3_step, dst3_offset);
- int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
- int dst3_idx = mad24(y, dst3_step, dst3_offset + x) & (int)0xfffffffc;
-
- uchar4 data_0 = *((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? src_idx - 12 : src_idx)));
- uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8 >= 0 ? src_idx - 8 : src_idx)));
- uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4 >= 0 ? src_idx - 4 : src_idx)));
- uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0 ));
-
- int total_bytes = src_offset + rows * src_step;
- uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4 < total_bytes ? src_idx + 4 : src_idx)));
- uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8 < total_bytes ? src_idx + 8 : src_idx)));
- uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx)));
-
- uchar4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3;
-
- if((dst0_offset & 3) == 3)
- tmp_data0 = (uchar4)(data_0.x, data_1.x, data_2.x, data_3.x);
- if((dst0_offset & 3) == 2)
- tmp_data0 = (uchar4)(data_1.x, data_2.x, data_3.x, data_4.x);
- if((dst0_offset & 3) == 1)
- tmp_data0 = (uchar4)(data_2.x, data_3.x, data_4.x, data_5.x);
- if((dst0_offset & 3) == 0)
- tmp_data0 = (uchar4)(data_3.x, data_4.x, data_5.x, data_6.x);
-
- if((dst1_offset & 3) == 3)
- tmp_data1 = (uchar4)(data_0.y, data_1.y, data_2.y, data_3.y);
- if((dst1_offset & 3) == 2)
- tmp_data1 = (uchar4)(data_1.y, data_2.y, data_3.y, data_4.y);
- if((dst1_offset & 3) == 1)
- tmp_data1 = (uchar4)(data_2.y, data_3.y, data_4.y, data_5.y);
- if((dst1_offset & 3) == 0)
- tmp_data1 = (uchar4)(data_3.y, data_4.y, data_5.y, data_6.y);
-
- if((dst2_offset & 3) == 3)
- tmp_data2 = (uchar4)(data_0.z, data_1.z, data_2.z, data_3.z);
- if((dst2_offset & 3) == 2)
- tmp_data2 = (uchar4)(data_1.z, data_2.z, data_3.z, data_4.z);
- if((dst2_offset & 3) == 1)
- tmp_data2 = (uchar4)(data_2.z, data_3.z, data_4.z, data_5.z);
- if((dst2_offset & 3) == 0)
- tmp_data2 = (uchar4)(data_3.z, data_4.z, data_5.z, data_6.z);
-
- if((dst3_offset & 3) == 3)
- tmp_data3 = (uchar4)(data_0.w, data_1.w, data_2.w, data_3.w);
- if((dst3_offset & 3) == 2)
- tmp_data3 = (uchar4)(data_1.w, data_2.w, data_3.w, data_4.w);
- if((dst3_offset & 3) == 1)
- tmp_data3 = (uchar4)(data_2.w, data_3.w, data_4.w, data_5.w);
- if((dst3_offset & 3) == 0)
- tmp_data3 = (uchar4)(data_3.w, data_4.w, data_5.w, data_6.w);
-
- uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx));
- uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx));
- uchar4 dst2_data = *((__global uchar4 *)(mat_dst2 + dst2_idx));
- uchar4 dst3_data = *((__global uchar4 *)(mat_dst3 + dst3_idx));
-
- tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
- tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? tmp_data0.y : dst0_data.y;
- tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.z : dst0_data.z;
- tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? tmp_data0.w : dst0_data.w;
-
- tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
- tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? tmp_data1.y : dst1_data.y;
- tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.z : dst1_data.z;
- tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? tmp_data1.w : dst1_data.w;
-
- tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
- tmp_data2.y = ((dst2_idx + 1 >= dst2_start) && (dst2_idx + 1 < dst2_end)) ? tmp_data2.y : dst2_data.y;
- tmp_data2.z = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.z : dst2_data.z;
- tmp_data2.w = ((dst2_idx + 3 >= dst2_start) && (dst2_idx + 3 < dst2_end)) ? tmp_data2.w : dst2_data.w;
-
- tmp_data3.x = ((dst3_idx + 0 >= dst3_start) && (dst3_idx + 0 < dst3_end)) ? tmp_data3.x : dst3_data.x;
- tmp_data3.y = ((dst3_idx + 1 >= dst3_start) && (dst3_idx + 1 < dst3_end)) ? tmp_data3.y : dst3_data.y;
- tmp_data3.z = ((dst3_idx + 2 >= dst3_start) && (dst3_idx + 2 < dst3_end)) ? tmp_data3.z : dst3_data.z;
- tmp_data3.w = ((dst3_idx + 3 >= dst3_start) && (dst3_idx + 3 < dst3_end)) ? tmp_data3.w : dst3_data.w;
-
- *((__global uchar4 *)(mat_dst0 + dst0_idx)) = tmp_data0;
- *((__global uchar4 *)(mat_dst1 + dst1_idx)) = tmp_data1;
- *((__global uchar4 *)(mat_dst2 + dst2_idx)) = tmp_data2;
- *((__global uchar4 *)(mat_dst3 + dst3_idx)) = tmp_data3;
- }
-}
-
-__kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int src_offset,
- __global uchar *mat_dst0, int dst0_step, int dst0_offset,
- __global uchar *mat_dst1, int dst1_step, int dst1_offset,
- __global uchar *mat_dst2, int dst2_step, int dst2_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- x = x << 2;
-
- int src_idx = mad24(y, src_step, src_offset);
-
- int dst0_start = mad24(y, dst0_step, dst0_offset);
- int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
- int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
-
- int dst1_start = mad24(y, dst1_step, dst1_offset);
- int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
- int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
-
- int dst2_start = mad24(y, dst2_step, dst2_offset);
- int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
- int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
-
- uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx));
- uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx));
- uchar4 dst2_data = *((__global uchar4 *)(mat_dst2 + dst2_idx));
-
- uchar4 tmp_data0, tmp_data1, tmp_data2;
-
- uchar src_data_0 = *(mat_src + src_idx + 3 * x - 9);
- uchar src_data_1 = *(mat_src + src_idx + 3 * x - 8);
- uchar src_data_2 = *(mat_src + src_idx + 3 * x - 7);
-
- uchar src_data_3 = *(mat_src + src_idx + 3 * x - 6);
- uchar src_data_4 = *(mat_src + src_idx + 3 * x - 5);
- uchar src_data_5 = *(mat_src + src_idx + 3 * x - 4);
-
- uchar src_data_6 = *(mat_src + src_idx + 3 * x - 3);
- uchar src_data_7 = *(mat_src + src_idx + 3 * x - 2);
- uchar src_data_8 = *(mat_src + src_idx + 3 * x - 1);
-
- uchar src_data_9 = *(mat_src + src_idx + 3 * x + 0);
- uchar src_data_10 = *(mat_src + src_idx + 3 * x + 1);
- uchar src_data_11 = *(mat_src + src_idx + 3 * x + 2);
-
- uchar src_data_12 = *(mat_src + src_idx + 3 * x + 3);
- uchar src_data_13 = *(mat_src + src_idx + 3 * x + 4);
- uchar src_data_14 = *(mat_src + src_idx + 3 * x + 5);
-
- uchar src_data_15 = *(mat_src + src_idx + 3 * x + 6);
- uchar src_data_16 = *(mat_src + src_idx + 3 * x + 7);
- uchar src_data_17 = *(mat_src + src_idx + 3 * x + 8);
-
- uchar src_data_18 = *(mat_src + src_idx + 3 * x + 9);
- uchar src_data_19 = *(mat_src + src_idx + 3 * x + 10);
- uchar src_data_20 = *(mat_src + src_idx + 3 * x + 11);
-
- uchar data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18};
- int index = 3 - dst0_offset & 3;
- tmp_data0 = (uchar4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
-
- uchar4 data0, data1, data2;
-
- data0 = (uchar4)(src_data_1, src_data_4, src_data_7, src_data_10);
- data1 = (dst1_offset & 3) == 2 ? (uchar4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0;
- data2 = (dst1_offset & 3) == 1 ? (uchar4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1;
- tmp_data1 = (dst1_offset & 3) == 0 ? (uchar4)(src_data_10, src_data_13, src_data_16, src_data_19): data2;
-
- data0 = (uchar4)(src_data_2, src_data_5, src_data_8, src_data_11);
- data1 = (dst2_offset & 3) == 2 ? (uchar4)(src_data_5, src_data_8, src_data_11, src_data_14) : data0;
- data2 = (dst2_offset & 3) == 1 ? (uchar4)(src_data_8, src_data_11, src_data_14, src_data_17) : data1;
- tmp_data2 = (dst2_offset & 3) == 0 ? (uchar4)(src_data_11, src_data_14, src_data_17, src_data_20) : data2;
-
- tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
- tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? tmp_data0.y : dst0_data.y;
- tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.z : dst0_data.z;
- tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? tmp_data0.w : dst0_data.w;
-
- tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
- tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? tmp_data1.y : dst1_data.y;
- tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.z : dst1_data.z;
- tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? tmp_data1.w : dst1_data.w;
-
- tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
- tmp_data2.y = ((dst2_idx + 1 >= dst2_start) && (dst2_idx + 1 < dst2_end)) ? tmp_data2.y : dst2_data.y;
- tmp_data2.z = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.z : dst2_data.z;
- tmp_data2.w = ((dst2_idx + 3 >= dst2_start) && (dst2_idx + 3 < dst2_end)) ? tmp_data2.w : dst2_data.w;
-
- *((__global uchar4 *)(mat_dst0 + dst0_idx)) = tmp_data0;
- *((__global uchar4 *)(mat_dst1 + dst1_idx)) = tmp_data1;
- *((__global uchar4 *)(mat_dst2 + dst2_idx)) = tmp_data2;
- }
-}
-
-__kernel void split_vector_C2_D0 (__global uchar *mat_src, int src_step, int src_offset,
- __global uchar *mat_dst0, int dst0_step, int dst0_offset,
- __global uchar *mat_dst1, int dst1_step, int dst1_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- x = x << 2;
-
- #define dst0_align ((dst0_offset & 3) << 1)
- #define dst1_align ((dst1_offset & 3) << 1)
- int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
- int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
-
- int dst0_start = mad24(y, dst0_step, dst0_offset);
- int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
- int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
-
- int dst1_start = mad24(y, dst1_step, dst1_offset);
- int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
- int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
-
- int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
- int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
- uchar8 src_data_0 = vload8(0, mat_src + src_idx_0);
- uchar8 src_data_1 = vload8(0, mat_src + src_idx_1);
- if(src_idx_0 == -6)
- src_data_0.s01234567 = src_data_0.s67012345;
- if(src_idx_0 == -4)
- src_data_0.s01234567 = src_data_0.s45670123;
- if(src_idx_0 == -2)
- src_data_0.s01234567 = src_data_0.s23456701;
- if(src_idx_1 == -6)
- src_data_1.s01234567 = src_data_1.s67012345;
- if(src_idx_1 == -4)
- src_data_1.s01234567 = src_data_1.s45670123;
- if(src_idx_1 == -2)
- src_data_1.s01234567 = src_data_1.s23456701;
-
- uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx));
- uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx));
-
- uchar4 tmp_data0, tmp_data1;
-
- tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? src_data_0.s0 : dst0_data.x;
- tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? src_data_0.s2 : dst0_data.y;
- tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? src_data_0.s4 : dst0_data.z;
- tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? src_data_0.s6 : dst0_data.w;
-
- tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? src_data_1.s1 : dst1_data.x;
- tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? src_data_1.s3 : dst1_data.y;
- tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? src_data_1.s5 : dst1_data.z;
- tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? src_data_1.s7 : dst1_data.w;
-
- *((__global uchar4 *)(mat_dst0 + dst0_idx)) = tmp_data0;
- *((__global uchar4 *)(mat_dst1 + dst1_idx)) = tmp_data1;
- }
-}
-
-__kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int src_offset,
- __global char *mat_dst0, int dst0_step, int dst0_offset,
- __global char *mat_dst1, int dst1_step, int dst1_offset,
- __global char *mat_dst2, int dst2_step, int dst2_offset,
- __global char *mat_dst3, int dst3_step, int dst3_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- x = x << 2;
-
- int src_idx = mad24(y, src_step, src_offset + (x << 2));
-
- int dst0_start = mad24(y, dst0_step, dst0_offset);
- int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
- int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
-
- int dst1_start = mad24(y, dst1_step, dst1_offset);
- int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
- int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
-
- int dst2_start = mad24(y, dst2_step, dst2_offset);
- int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
- int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
-
- int dst3_start = mad24(y, dst3_step, dst3_offset);
- int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
- int dst3_idx = mad24(y, dst3_step, dst3_offset + x & (int)0xfffffffc);
-
- char4 data_0 = *((global char4 *)(mat_src + src_idx - 12));
- char4 data_1 = *((global char4 *)(mat_src + src_idx - 8 ));
- char4 data_2 = *((global char4 *)(mat_src + src_idx - 4 ));
- char4 data_3 = *((global char4 *)(mat_src + src_idx + 0 ));
- char4 data_4 = *((global char4 *)(mat_src + src_idx + 4 ));
- char4 data_5 = *((global char4 *)(mat_src + src_idx + 8 ));
- char4 data_6 = *((global char4 *)(mat_src + src_idx + 12));
-
- char4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3;
-
- if((dst0_offset & 3) == 3)
- tmp_data0 = (char4)(data_0.x, data_1.x, data_2.x, data_3.x);
- if((dst0_offset & 3) == 2)
- tmp_data0 = (char4)(data_1.x, data_2.x, data_3.x, data_4.x);
- if((dst0_offset & 3) == 1)
- tmp_data0 = (char4)(data_2.x, data_3.x, data_4.x, data_5.x);
- if((dst0_offset & 3) == 0)
- tmp_data0 = (char4)(data_3.x, data_4.x, data_5.x, data_6.x);
-
- if((dst1_offset & 3) == 3)
- tmp_data1 = (char4)(data_0.y, data_1.y, data_2.y, data_3.y);
- if((dst1_offset & 3) == 2)
- tmp_data1 = (char4)(data_1.y, data_2.y, data_3.y, data_4.y);
- if((dst1_offset & 3) == 1)
- tmp_data1 = (char4)(data_2.y, data_3.y, data_4.y, data_5.y);
- if((dst1_offset & 3) == 0)
- tmp_data1 = (char4)(data_3.y, data_4.y, data_5.y, data_6.y);
-
- if((dst2_offset & 3) == 3)
- tmp_data2 = (char4)(data_0.z, data_1.z, data_2.z, data_3.z);
- if((dst2_offset & 3) == 2)
- tmp_data2 = (char4)(data_1.z, data_2.z, data_3.z, data_4.z);
- if((dst2_offset & 3) == 1)
- tmp_data2 = (char4)(data_2.z, data_3.z, data_4.z, data_5.z);
- if((dst2_offset & 3) == 0)
- tmp_data2 = (char4)(data_3.z, data_4.z, data_5.z, data_6.z);
-
- if((dst3_offset & 3) == 3)
- tmp_data3 = (char4)(data_0.w, data_1.w, data_2.w, data_3.w);
- if((dst3_offset & 3) == 2)
- tmp_data3 = (char4)(data_1.w, data_2.w, data_3.w, data_4.w);
- if((dst3_offset & 3) == 1)
- tmp_data3 = (char4)(data_2.w, data_3.w, data_4.w, data_5.w);
- if((dst3_offset & 3) == 0)
- tmp_data3 = (char4)(data_3.w, data_4.w, data_5.w, data_6.w);
-
- char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx));
- char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx));
- char4 dst2_data = *((__global char4 *)(mat_dst2 + dst2_idx));
- char4 dst3_data = *((__global char4 *)(mat_dst3 + dst3_idx));
-
- tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
- tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? tmp_data0.y : dst0_data.y;
- tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.z : dst0_data.z;
- tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? tmp_data0.w : dst0_data.w;
-
- tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
- tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? tmp_data1.y : dst1_data.y;
- tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.z : dst1_data.z;
- tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? tmp_data1.w : dst1_data.w;
-
- tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
- tmp_data2.y = ((dst2_idx + 1 >= dst2_start) && (dst2_idx + 1 < dst2_end)) ? tmp_data2.y : dst2_data.y;
- tmp_data2.z = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.z : dst2_data.z;
- tmp_data2.w = ((dst2_idx + 3 >= dst2_start) && (dst2_idx + 3 < dst2_end)) ? tmp_data2.w : dst2_data.w;
-
- tmp_data3.x = ((dst3_idx + 0 >= dst3_start) && (dst3_idx + 0 < dst3_end)) ? tmp_data3.x : dst3_data.x;
- tmp_data3.y = ((dst3_idx + 1 >= dst3_start) && (dst3_idx + 1 < dst3_end)) ? tmp_data3.y : dst3_data.y;
- tmp_data3.z = ((dst3_idx + 2 >= dst3_start) && (dst3_idx + 2 < dst3_end)) ? tmp_data3.z : dst3_data.z;
- tmp_data3.w = ((dst3_idx + 3 >= dst3_start) && (dst3_idx + 3 < dst3_end)) ? tmp_data3.w : dst3_data.w;
-
- *((__global char4 *)(mat_dst0 + dst0_idx)) = tmp_data0;
- *((__global char4 *)(mat_dst1 + dst1_idx)) = tmp_data1;
- *((__global char4 *)(mat_dst2 + dst2_idx)) = tmp_data2;
- *((__global char4 *)(mat_dst3 + dst3_idx)) = tmp_data3;
- }
-}
-
-__kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int src_offset,
- __global char *mat_dst0, int dst0_step, int dst0_offset,
- __global char *mat_dst1, int dst1_step, int dst1_offset,
- __global char *mat_dst2, int dst2_step, int dst2_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- x = x << 2;
-
- int src_idx = mad24(y, src_step, src_offset);
-
- int dst0_start = mad24(y, dst0_step, dst0_offset);
- int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
- int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
-
- int dst1_start = mad24(y, dst1_step, dst1_offset);
- int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
- int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
-
- int dst2_start = mad24(y, dst2_step, dst2_offset);
- int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
- int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
-
- char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx));
- char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx));
- char4 dst2_data = *((__global char4 *)(mat_dst2 + dst2_idx));
-
- char4 tmp_data0, tmp_data1, tmp_data2;
-
- char src_data_0 = *(mat_src + src_idx + 3 * x - 9);
- char src_data_1 = *(mat_src + src_idx + 3 * x - 8);
- char src_data_2 = *(mat_src + src_idx + 3 * x - 7);
-
- char src_data_3 = *(mat_src + src_idx + 3 * x - 6);
- char src_data_4 = *(mat_src + src_idx + 3 * x - 5);
- char src_data_5 = *(mat_src + src_idx + 3 * x - 4);
-
- char src_data_6 = *(mat_src + src_idx + 3 * x - 3);
- char src_data_7 = *(mat_src + src_idx + 3 * x - 2);
- char src_data_8 = *(mat_src + src_idx + 3 * x - 1);
-
- char src_data_9 = *(mat_src + src_idx + 3 * x + 0);
- char src_data_10 = *(mat_src + src_idx + 3 * x + 1);
- char src_data_11 = *(mat_src + src_idx + 3 * x + 2);
-
- char src_data_12 = *(mat_src + src_idx + 3 * x + 3);
- char src_data_13 = *(mat_src + src_idx + 3 * x + 4);
- char src_data_14 = *(mat_src + src_idx + 3 * x + 5);
-
- char src_data_15 = *(mat_src + src_idx + 3 * x + 6);
- char src_data_16 = *(mat_src + src_idx + 3 * x + 7);
- char src_data_17 = *(mat_src + src_idx + 3 * x + 8);
-
- char src_data_18 = *(mat_src + src_idx + 3 * x + 9);
- char src_data_19 = *(mat_src + src_idx + 3 * x + 10);
- char src_data_20 = *(mat_src + src_idx + 3 * x + 11);
-
- char data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18};
- int index = 3 - dst0_offset & 3;
- tmp_data0 = (char4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
-
- char4 data0, data1, data2;
-
- data0 = (char4)(src_data_1, src_data_4, src_data_7, src_data_10);
- data1 = (dst1_offset & 3) == 2 ? (char4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0;
- data2 = (dst1_offset & 3) == 1 ? (char4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1;
- tmp_data1 = (dst1_offset & 3) == 0 ? (char4)(src_data_10, src_data_13, src_data_16, src_data_19): data2;
-
- data0 = (char4)(src_data_2, src_data_5, src_data_8, src_data_11);
- data1 = (dst2_offset & 3) == 2 ? (char4)(src_data_5, src_data_8, src_data_11, src_data_14) : data0;
- data2 = (dst2_offset & 3) == 1 ? (char4)(src_data_8, src_data_11, src_data_14, src_data_17) : data1;
- tmp_data2 = (dst2_offset & 3) == 0 ? (char4)(src_data_11, src_data_14, src_data_17, src_data_20) : data2;
-
- tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
- tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? tmp_data0.y : dst0_data.y;
- tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.z : dst0_data.z;
- tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? tmp_data0.w : dst0_data.w;
-
- tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
- tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? tmp_data1.y : dst1_data.y;
- tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.z : dst1_data.z;
- tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? tmp_data1.w : dst1_data.w;
-
- tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
- tmp_data2.y = ((dst2_idx + 1 >= dst2_start) && (dst2_idx + 1 < dst2_end)) ? tmp_data2.y : dst2_data.y;
- tmp_data2.z = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.z : dst2_data.z;
- tmp_data2.w = ((dst2_idx + 3 >= dst2_start) && (dst2_idx + 3 < dst2_end)) ? tmp_data2.w : dst2_data.w;
-
- *((__global char4 *)(mat_dst0 + dst0_idx)) = tmp_data0;
- *((__global char4 *)(mat_dst1 + dst1_idx)) = tmp_data1;
- *((__global char4 *)(mat_dst2 + dst2_idx)) = tmp_data2;
- }
-}
-
-__kernel void split_vector_C2_D1 (__global char *mat_src, int src_step, int src_offset,
- __global char *mat_dst0, int dst0_step, int dst0_offset,
- __global char *mat_dst1, int dst1_step, int dst1_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- x = x << 2;
-
- #define dst0_align ((dst0_offset & 3) << 1)
- #define dst1_align ((dst1_offset & 3) << 1)
- int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
- int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
-
- int dst0_start = mad24(y, dst0_step, dst0_offset);
- int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
- int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
-
- int dst1_start = mad24(y, dst1_step, dst1_offset);
- int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
- int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
- int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
- int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
- char8 src_data_0 = vload8(0, mat_src + src_idx_0);
- char8 src_data_1 = vload8(0, mat_src + src_idx_1);
- if(src_idx_0 == -6)
- src_data_0.s01234567 = src_data_0.s67012345;
- if(src_idx_0 == -4)
- src_data_0.s01234567 = src_data_0.s45670123;
- if(src_idx_0 == -2)
- src_data_0.s01234567 = src_data_0.s23456701;
- if(src_idx_1 == -6)
- src_data_1.s01234567 = src_data_1.s67012345;
- if(src_idx_1 == -4)
- src_data_1.s01234567 = src_data_1.s45670123;
- if(src_idx_1 == -2)
- src_data_1.s01234567 = src_data_1.s23456701;
- char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx));
- char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx));
-
- char4 tmp_data0, tmp_data1;
-
- tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? src_data_0.s0 : dst0_data.x;
- tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? src_data_0.s2 : dst0_data.y;
- tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? src_data_0.s4 : dst0_data.z;
- tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? src_data_0.s6 : dst0_data.w;
-
- tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? src_data_1.s1 : dst1_data.x;
- tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? src_data_1.s3 : dst1_data.y;
- tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? src_data_1.s5 : dst1_data.z;
- tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? src_data_1.s7 : dst1_data.w;
-
- *((__global char4 *)(mat_dst0 + dst0_idx)) = tmp_data0;
- *((__global char4 *)(mat_dst1 + dst1_idx)) = tmp_data1;
- }
-}
-
-__kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int src_offset,
- __global ushort *mat_dst0, int dst0_step, int dst0_offset,
- __global ushort *mat_dst1, int dst1_step, int dst1_offset,
- __global ushort *mat_dst2, int dst2_step, int dst2_offset,
- __global ushort *mat_dst3, int dst3_step, int dst3_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- x = x << 1;
-
- int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
- int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
-
- int dst0_start = mad24(y, dst0_step, dst0_offset);
- int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
- int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
-
- int dst1_start = mad24(y, dst1_step, dst1_offset);
- int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
- int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
-
- int dst2_start = mad24(y, dst2_step, dst2_offset);
- int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
- int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
-
- int dst3_start = mad24(y, dst3_step, dst3_offset);
- int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
- int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc);
-
- int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
- ushort8 src_data0 = vload8(0,(__global ushort *)((__global char *)mat_src + src_idx_0));
- if(src_idx_0 == -6)
- src_data0.s01234567 = src_data0.s67012345;
- if(src_idx_0 == -4)
- src_data0.s01234567 = src_data0.s45670123;
- if(src_idx_0 == -2)
- src_data0.s01234567 = src_data0.s23456701;
- ushort4 src_data1 = *((__global ushort4 *)((__global char *)mat_src + src_idx_1));
-
- ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
- ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));
- ushort2 dst2_data = *((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx));
- ushort2 dst3_data = *((__global ushort2 *)((__global char *)mat_dst3 + dst3_idx));
-
- ushort2 tmp_data0, tmp_data1, tmp_data2, tmp_data3;
-
- tmp_data0 = (dst0_offset & 3) == 0 ? (ushort2)(src_data0.s4, src_data1.s0) : (ushort2)(src_data0.s0, src_data0.s4);
- tmp_data1 = (dst1_offset & 3) == 0 ? (ushort2)(src_data0.s5, src_data1.s1) : (ushort2)(src_data0.s1, src_data0.s5);
- tmp_data2 = (dst2_offset & 3) == 0 ? (ushort2)(src_data0.s6, src_data1.s2) : (ushort2)(src_data0.s2, src_data0.s6);
- tmp_data3 = (dst3_offset & 3) == 0 ? (ushort2)(src_data0.s7, src_data1.s3) : (ushort2)(src_data0.s3, src_data0.s7);
-
- tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
- tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.y : dst0_data.y;
-
- tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
- tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.y : dst1_data.y;
-
- tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
- tmp_data2.y = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.y : dst2_data.y;
-
- tmp_data3.x = ((dst3_idx + 0 >= dst3_start) && (dst3_idx + 0 < dst3_end)) ? tmp_data3.x : dst3_data.x;
- tmp_data3.y = ((dst3_idx + 2 >= dst3_start) && (dst3_idx + 2 < dst3_end)) ? tmp_data3.y : dst3_data.y;
-
- *((global ushort2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0;
- *((global ushort2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1;
- *((global ushort2 *)((__global char *)mat_dst2 + dst2_idx)) = tmp_data2;
- *((global ushort2 *)((__global char *)mat_dst3 + dst3_idx)) = tmp_data3;
- }
-}
-
-__kernel void split_vector_C3_D2 (__global ushort *mat_src, int src_step, int src_offset,
- __global ushort *mat_dst0, int dst0_step, int dst0_offset,
- __global ushort *mat_dst1, int dst1_step, int dst1_offset,
- __global ushort *mat_dst2, int dst2_step, int dst2_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- x = x << 1;
-
- int src_idx = mad24(y, src_step, src_offset);
-
- int dst0_start = mad24(y, dst0_step, dst0_offset);
- int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
- int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
-
- int dst1_start = mad24(y, dst1_step, dst1_offset);
- int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
- int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
-
- int dst2_start = mad24(y, dst2_step, dst2_offset);
- int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
- int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
-
- ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
- ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));
- ushort2 dst2_data = *((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx));
-
- ushort2 tmp_data0, tmp_data1, tmp_data2;
-
- ushort src_data_0 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x - 3];
- ushort src_data_1 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x - 2];
- ushort src_data_2 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x - 1];
- ushort src_data_3 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 0];
- ushort src_data_4 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 1];
- ushort src_data_5 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 2];
- ushort src_data_6 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 3];
- ushort src_data_7 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 4];
- ushort src_data_8 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 5];
-
- tmp_data0 = (dst0_offset & 3) == 0 ? (ushort2)(src_data_3, src_data_6) : (ushort2)(src_data_0, src_data_3);
- tmp_data1 = (dst1_offset & 3) == 0 ? (ushort2)(src_data_4, src_data_7) : (ushort2)(src_data_1, src_data_4);
- tmp_data2 = (dst2_offset & 3) == 0 ? (ushort2)(src_data_5, src_data_8) : (ushort2)(src_data_2, src_data_5);
-
- tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
- tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.y : dst0_data.y;
-
- tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
- tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.y : dst1_data.y;
-
- tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
- tmp_data2.y = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.y : dst2_data.y;
-
- *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0;
- *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1;
- *((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx)) = tmp_data2;
- }
-}
-
-__kernel void split_vector_C2_D2 (__global ushort *mat_src, int src_step, int src_offset,
- __global ushort *mat_dst0, int dst0_step, int dst0_offset,
- __global ushort *mat_dst1, int dst1_step, int dst1_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- x = x << 1;
-
- #define dst0_align ((dst0_offset & 3) << 1)
- #define dst1_align ((dst1_offset & 3) << 1)
- int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
- int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
-
- int dst0_start = mad24(y, dst0_step, dst0_offset);
- int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
- int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
-
- int dst1_start = mad24(y, dst1_step, dst1_offset);
- int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
- int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
-
- int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
- int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
- ushort4 src_data_0 = vload4(0, (__global ushort *)((__global char *)mat_src + src1_index_fix));
- ushort4 src_data_1 = vload4(0, (__global ushort *)((__global char *)mat_src + src2_index_fix));
- if(src_idx_0 < 0)
- {
- ushort4 tmp;
- tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
- src_data_0.xyzw = (src_idx_1 == -1) ? src_data_0.wxyz:tmp.xyzw;
- }
- if(src_idx_1 < 0)
- {
- ushort4 tmp;
- tmp.xyzw = (src_idx_1 == -2) ? src_data_1.zwxy : src_data_1.yzwx;
- src_data_1.xyzw = (src_idx_1 == -1) ? src_data_1.wxyz : tmp.xyzw;
- }
-
- ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
- ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));
-
- ushort2 tmp_data0, tmp_data1;
-
- tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? src_data_0.x : dst0_data.x;
- tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? src_data_0.z : dst0_data.y;
-
- tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? src_data_1.y : dst1_data.x;
- tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? src_data_1.w : dst1_data.y;
-
- *((global ushort2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0;
- *((global ushort2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1;
- }
-}
-__kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int src_offset,
- __global short *mat_dst0, int dst0_step, int dst0_offset,
- __global short *mat_dst1, int dst1_step, int dst1_offset,
- __global short *mat_dst2, int dst2_step, int dst2_offset,
- __global short *mat_dst3, int dst3_step, int dst3_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- x = x << 1;
-
- int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
- int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
-
- int dst0_start = mad24(y, dst0_step, dst0_offset);
- int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
- int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
-
- int dst1_start = mad24(y, dst1_step, dst1_offset);
- int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
- int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
-
- int dst2_start = mad24(y, dst2_step, dst2_offset);
- int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
- int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
-
- int dst3_start = mad24(y, dst3_step, dst3_offset);
- int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
- int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc);
- int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
- short8 src_data0 = vload8(0,(__global short *)((__global char *)mat_src + src_idx_0));
-
- if(src_idx_0 == -6)
- src_data0.s01234567 = src_data0.s67012345;
- if(src_idx_0 == -4)
- src_data0.s01234567 = src_data0.s45670123;
- if(src_idx_0 == -2)
- src_data0.s01234567 = src_data0.s23456701;
-
- short4 src_data1 = *((__global short4 *)((__global char *)mat_src + src_idx_1));
-
- short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
- short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
- short2 dst2_data = *((__global short2 *)((__global char *)mat_dst2 + dst2_idx));
- short2 dst3_data = *((__global short2 *)((__global char *)mat_dst3 + dst3_idx));
-
- short2 tmp_data0, tmp_data1, tmp_data2, tmp_data3;
-
- tmp_data0 = (dst0_offset & 3) == 0 ? (short2)(src_data0.s4, src_data1.s0) : (short2)(src_data0.s0, src_data0.s4);
- tmp_data1 = (dst1_offset & 3) == 0 ? (short2)(src_data0.s5, src_data1.s1) : (short2)(src_data0.s1, src_data0.s5);
- tmp_data2 = (dst2_offset & 3) == 0 ? (short2)(src_data0.s6, src_data1.s2) : (short2)(src_data0.s2, src_data0.s6);
- tmp_data3 = (dst3_offset & 3) == 0 ? (short2)(src_data0.s7, src_data1.s3) : (short2)(src_data0.s3, src_data0.s7);
-
- tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
- tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.y : dst0_data.y;
-
- tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
- tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.y : dst1_data.y;
-
- tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
- tmp_data2.y = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.y : dst2_data.y;
-
- tmp_data3.x = ((dst3_idx + 0 >= dst3_start) && (dst3_idx + 0 < dst3_end)) ? tmp_data3.x : dst3_data.x;
- tmp_data3.y = ((dst3_idx + 2 >= dst3_start) && (dst3_idx + 2 < dst3_end)) ? tmp_data3.y : dst3_data.y;
-
- *((global short2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0;
- *((global short2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1;
- *((global short2 *)((__global char *)mat_dst2 + dst2_idx)) = tmp_data2;
- *((global short2 *)((__global char *)mat_dst3 + dst3_idx)) = tmp_data3;
- }
-}
-__kernel void split_vector_C3_D3 (__global short *mat_src, int src_step, int src_offset,
- __global short *mat_dst0, int dst0_step, int dst0_offset,
- __global short *mat_dst1, int dst1_step, int dst1_offset,
- __global short *mat_dst2, int dst2_step, int dst2_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- x = x << 1;
-
- int src_idx = mad24(y, src_step, src_offset);
-
- int dst0_start = mad24(y, dst0_step, dst0_offset);
- int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
- int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
-
- int dst1_start = mad24(y, dst1_step, dst1_offset);
- int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
- int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
-
- int dst2_start = mad24(y, dst2_step, dst2_offset);
- int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
- int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
+#if DATA_DEPTH == 0
+#define BASE_TYPE uchar
+#elif DATA_DEPTH == 1
+#error data_depth char, use uchar datatype instead
+#elif DATA_DEPTH == 2
+#define BASE_TYPE ushort
+#elif DATA_DEPTH == 3
+#error data_depth short, use ushort datatype instead
+#elif DATA_DEPTH == 4
+#define BASE_TYPE int
+#elif DATA_DEPTH == 5
+#define BASE_TYPE float
+#elif DATA_DEPTH == 6
+#define BASE_TYPE double
+#else
+#error data_depth
+#endif
- short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
- short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
- short2 dst2_data = *((__global short2 *)((__global char *)mat_dst2 + dst2_idx));
+#if DATA_CHAN == 2
+#define SRC_VEC_SIZE 2
+#elif DATA_CHAN == 3
+#define SRC_VEC_SIZE 4 // C3 is stored as C4
+#elif DATA_CHAN == 4
+#define SRC_VEC_SIZE 4
+#else
+#error data_chan
+#endif
- short2 tmp_data0, tmp_data1, tmp_data2;
+#define __CAT(x, y) x##y
+#define CAT(x, y) __CAT(x, y)
- short src_data_0 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x - 3];
- short src_data_1 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x - 2];
- short src_data_2 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x - 1];
- short src_data_3 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 0];
- short src_data_4 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 1];
- short src_data_5 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 2];
- short src_data_6 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 3];
- short src_data_7 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 4];
- short src_data_8 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 5];
+#define uchar1 uchar
+#define char1 char
+#define ushort1 ushort
+#define short1 short
+#define int1 int
+#define float1 float
+#define double1 double
- tmp_data0 = (dst0_offset & 3) == 0 ? (short2)(src_data_3, src_data_6) : (short2)(src_data_0, src_data_3);
- tmp_data1 = (dst1_offset & 3) == 0 ? (short2)(src_data_4, src_data_7) : (short2)(src_data_1, src_data_4);
- tmp_data2 = (dst2_offset & 3) == 0 ? (short2)(src_data_5, src_data_8) : (short2)(src_data_2, src_data_5);
+#define TYPE BASE_TYPE
- tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
- tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.y : dst0_data.y;
+#define SRC_TYPE CAT(BASE_TYPE, SRC_VEC_SIZE)
- tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
- tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.y : dst1_data.y;
+#define DST_VEC_TYPE CAT(BASE_TYPE, VEC_SIZE)
- tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
- tmp_data2.y = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.y : dst2_data.y;
-
- *((__global short2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0;
- *((__global short2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1;
- *((__global short2 *)((__global char *)mat_dst2 + dst2_idx)) = tmp_data2;
- }
-}
+#define vstore1 vstore
+#define VSTORE CAT(vstore, VEC_SIZE)
+#define VSTORE_ALIGNED(ptr, v) *((__global DST_VEC_TYPE*)(ptr)) = (v)
+#define VSTORE_UNALIGNED(ptr, v) VSTORE((v), 0, (__global TYPE*)(ptr))
+#ifdef DST0_ALIGNED
+#define VSTORE_dst0 VSTORE_ALIGNED
+#else
+#define VSTORE_dst0 VSTORE_UNALIGNED
+#endif
+#ifdef DST1_ALIGNED
+#define VSTORE_dst1 VSTORE_ALIGNED
+#else
+#define VSTORE_dst1 VSTORE_UNALIGNED
+#endif
+#ifdef DST2_ALIGNED
+#define VSTORE_dst2 VSTORE_ALIGNED
+#else
+#define VSTORE_dst2 VSTORE_UNALIGNED
+#endif
+#ifdef DST3_ALIGNED
+#define VSTORE_dst3 VSTORE_ALIGNED
+#else
+#define VSTORE_dst3 VSTORE_UNALIGNED
+#endif
-__kernel void split_vector_C2_D3 (__global short *mat_src, int src_step, int src_offset,
- __global short *mat_dst0, int dst0_step, int dst0_offset,
- __global short *mat_dst1, int dst1_step, int dst1_offset,
- int rows, int cols, int dst_step1)
+__kernel void split_vector(
+ __global SRC_TYPE* src, int srcStepBytes, int2 srcOffset, // offset.x in bytes
+ __global TYPE* dst0, int dst0StepBytes, int2 dst0Offset,
+ __global TYPE* dst1, int dst1StepBytes, int2 dst1Offset,
+#if DATA_CHAN > 2
+ __global TYPE* dst2, int dst2StepBytes, int2 dst2Offset,
+#endif
+#if DATA_CHAN > 3
+ __global TYPE* dst3, int dst3StepBytes, int2 dst3Offset,
+#endif
+ int2 size)
{
- int x = get_global_id(0);
+ int x = get_global_id(0) * VEC_SIZE;
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if (x < size.x && y < size.y)
{
- x = x << 1;
-
- #define dst0_align ((dst0_offset & 3) << 1)
- #define dst1_align ((dst1_offset & 3) << 1)
- int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
- int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
-
- int dst0_start = mad24(y, dst0_step, dst0_offset);
- int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
- int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
-
- int dst1_start = mad24(y, dst1_step, dst1_offset);
- int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
- int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
- int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
- int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
- short4 src_data_0 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_0));
- short4 src_data_1 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_1));
- if(src_idx_0 < 0)
+ SRC_TYPE srcData[VEC_SIZE];
+ int xOffsetLimitBytes = srcOffset.x + size.x * sizeof(SRC_TYPE);
+ int xOffsetBytes = srcOffset.x + x * sizeof(SRC_TYPE);
+ int yOffsetBytes = (srcOffset.y + y) * srcStepBytes;
+#pragma unroll
+ for (int i = 0; i < VEC_SIZE; i++, xOffsetBytes += sizeof(SRC_TYPE))
{
- short4 tmp;
- tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
- src_data_0.xyzw = (src_idx_0 == -1) ? src_data_0.wxyz:tmp.xyzw;
+ srcData[i] = (xOffsetBytes >= xOffsetLimitBytes) ? (SRC_TYPE)0 :
+ *(__global SRC_TYPE*)((__global char*)src + yOffsetBytes + xOffsetBytes);
}
- if(src_idx_1< 0)
- {
- short4 tmp;
- tmp.xyzw = ( src_idx_1== -2) ? src_data_1.zwxy : src_data_1.yzwx;
- src_data_1.xyzw = ( src_idx_1== -1) ? src_data_1.wxyz : tmp.xyzw;
- }
-
-
- short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
- short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
-
- short2 tmp_data0, tmp_data1;
-
- tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? src_data_0.x : dst0_data.x;
- tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? src_data_0.z : dst0_data.y;
- tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? src_data_1.y : dst1_data.x;
- tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? src_data_1.w : dst1_data.y;
-
- *((global short2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0;
- *((global short2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1;
- }
-}
-__kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src_offset,
- __global int *mat_dst0, int dst0_step, int dst0_offset,
- __global int *mat_dst1, int dst1_step, int dst1_offset,
- __global int *mat_dst2, int dst2_step, int dst2_offset,
- __global int *mat_dst3, int dst3_step, int dst3_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- int src_idx = mad24(y, src_step, src_offset);
- int dst0_idx = mad24(y, dst0_step, dst0_offset);
- int dst1_idx = mad24(y, dst1_step, dst1_offset);
- int dst2_idx = mad24(y, dst2_step, dst2_offset);
- int dst3_idx = mad24(y, dst3_step, dst3_offset);
-
- int4 src_data = ((__global int4 *)((__global char *)mat_src + src_idx))[x];
-
- ((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
- ((__global int *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;
- ((__global int *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data.z;
- ((__global int *)((__global char *)mat_dst3 + dst3_idx))[x] = src_data.w;
- }
-}
-__kernel void split_vector_C3_D4 (__global int *mat_src, int src_step, int src_offset,
- __global int *mat_dst0, int dst0_step, int dst0_offset,
- __global int *mat_dst1, int dst1_step, int dst1_offset,
- __global int *mat_dst2, int dst2_step, int dst2_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- int src_idx = mad24(y, src_step, src_offset);
- int dst0_idx = mad24(y, dst0_step, dst0_offset);
- int dst1_idx = mad24(y, dst1_step, dst1_offset);
- int dst2_idx = mad24(y, dst2_step, dst2_offset);
-
- int src_data_0 = ((__global int *)((__global char *)mat_src + src_idx))[3 * x + 0];
- int src_data_1 = ((__global int *)((__global char *)mat_src + src_idx))[3 * x + 1];
- int src_data_2 = ((__global int *)((__global char *)mat_src + src_idx))[3 * x + 2];
-
- ((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data_0;
- ((__global int *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data_1;
- ((__global int *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data_2;
- }
-}
-
-__kernel void split_vector_C2_D4 (__global int *mat_src, int src_step, int src_offset,
- __global int *mat_dst0, int dst0_step, int dst0_offset,
- __global int *mat_dst1, int dst1_step, int dst1_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- int src_idx = mad24(y, src_step, src_offset);
- int dst0_idx = mad24(y, dst0_step, dst0_offset);
- int dst1_idx = mad24(y, dst1_step, dst1_offset);
-
- int2 src_data = ((__global int2 *)((__global char *)mat_src + src_idx))[x];
-
- ((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
- ((__global int *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;
- }
-}
-
-__kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int src_offset,
- __global float *mat_dst0, int dst0_step, int dst0_offset,
- __global float *mat_dst1, int dst1_step, int dst1_offset,
- __global float *mat_dst2, int dst2_step, int dst2_offset,
- __global float *mat_dst3, int dst3_step, int dst3_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- int src_idx = mad24(y, src_step, src_offset);
- int dst0_idx = mad24(y, dst0_step, dst0_offset);
- int dst1_idx = mad24(y, dst1_step, dst1_offset);
- int dst2_idx = mad24(y, dst2_step, dst2_offset);
- int dst3_idx = mad24(y, dst3_step, dst3_offset);
-
- float4 src_data = ((__global float4 *)((__global char *)mat_src + src_idx))[x];
-
- ((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
- ((__global float *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;
- ((__global float *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data.z;
- ((__global float *)((__global char *)mat_dst3 + dst3_idx))[x] = src_data.w;
- }
-}
-
-__kernel void split_vector_C3_D5 (__global float *mat_src, int src_step, int src_offset,
- __global float *mat_dst0, int dst0_step, int dst0_offset,
- __global float *mat_dst1, int dst1_step, int dst1_offset,
- __global float *mat_dst2, int dst2_step, int dst2_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- int src_idx = mad24(y, src_step, src_offset);
- int dst0_idx = mad24(y, dst0_step, dst0_offset);
- int dst1_idx = mad24(y, dst1_step, dst1_offset);
- int dst2_idx = mad24(y, dst2_step, dst2_offset);
-
- float src_data_0 = ((__global float *)((__global char *)mat_src + src_idx))[3 * x + 0];
- float src_data_1 = ((__global float *)((__global char *)mat_src + src_idx))[3 * x + 1];
- float src_data_2 = ((__global float *)((__global char *)mat_src + src_idx))[3 * x + 2];
-
- ((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data_0;
- ((__global float *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data_1;
- ((__global float *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data_2;
- }
-}
-
-__kernel void split_vector_C2_D5 (__global float *mat_src, int src_step, int src_offset,
- __global float *mat_dst0, int dst0_step, int dst0_offset,
- __global float *mat_dst1, int dst1_step, int dst1_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- int src_idx = mad24(y, src_step, src_offset);
- int dst0_idx = mad24(y, dst0_step, dst0_offset);
- int dst1_idx = mad24(y, dst1_step, dst1_offset);
-
- float2 src_data = ((__global float2 *)((__global char *)mat_src + src_idx))[x];
-
- ((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
- ((__global float *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;
- }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int src_offset,
- __global double *mat_dst0, int dst0_step, int dst0_offset,
- __global double *mat_dst1, int dst1_step, int dst1_offset,
- __global double *mat_dst2, int dst2_step, int dst2_offset,
- __global double *mat_dst3, int dst3_step, int dst3_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- int src_idx = mad24(y, src_step, src_offset);
- int dst0_idx = mad24(y, dst0_step, dst0_offset);
- int dst1_idx = mad24(y, dst1_step, dst1_offset);
- int dst2_idx = mad24(y, dst2_step, dst2_offset);
- int dst3_idx = mad24(y, dst3_step, dst3_offset);
-
- double4 src_data = ((__global double4 *)((__global char *)mat_src + src_idx))[x];
-
- ((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
- ((__global double *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;
- ((__global double *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data.z;
- ((__global double *)((__global char *)mat_dst3 + dst3_idx))[x] = src_data.w;
- }
-}
-
-__kernel void split_vector_C3_D6 (__global double *mat_src, int src_step, int src_offset,
- __global double *mat_dst0, int dst0_step, int dst0_offset,
- __global double *mat_dst1, int dst1_step, int dst1_offset,
- __global double *mat_dst2, int dst2_step, int dst2_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- int src_idx = mad24(y, src_step, src_offset);
- int dst0_idx = mad24(y, dst0_step, dst0_offset);
- int dst1_idx = mad24(y, dst1_step, dst1_offset);
- int dst2_idx = mad24(y, dst2_step, dst2_offset);
-
- double src_data_0 = ((__global double *)((__global char *)mat_src + src_idx))[3 * x + 0];
- double src_data_1 = ((__global double *)((__global char *)mat_src + src_idx))[3 * x + 1];
- double src_data_2 = ((__global double *)((__global char *)mat_src + src_idx))[3 * x + 2];
-
- ((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data_0;
- ((__global double *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data_1;
- ((__global double *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data_2;
- }
-}
-
-__kernel void split_vector_C2_D6 (__global double *mat_src, int src_step, int src_offset,
- __global double *mat_dst0, int dst0_step, int dst0_offset,
- __global double *mat_dst1, int dst1_step, int dst1_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- int src_idx = mad24(y, src_step, src_offset);
- int dst0_idx = mad24(y, dst0_step, dst0_offset);
- int dst1_idx = mad24(y, dst1_step, dst1_offset);
+#if VEC_SIZE == 1
+ TYPE dstC0 = srcData[0].s0;
+ TYPE dstC1 = srcData[0].s1;
+#if DATA_CHAN > 2
+ TYPE dstC2 = srcData[0].s2;
+#endif
+#if DATA_CHAN > 3
+ TYPE dstC3 = srcData[0].s3;
+#endif
+# define VEC_TO_ARRAY(v, a) TYPE a[1] = {v};
+#elif VEC_SIZE == 2
+ DST_VEC_TYPE dstC0 = (DST_VEC_TYPE)(srcData[0].s0, srcData[1].s0);
+ DST_VEC_TYPE dstC1 = (DST_VEC_TYPE)(srcData[0].s1, srcData[1].s1);
+#if DATA_CHAN > 2
+ DST_VEC_TYPE dstC2 = (DST_VEC_TYPE)(srcData[0].s2, srcData[1].s2);
+#endif
+#if DATA_CHAN > 3
+ DST_VEC_TYPE dstC3 = (DST_VEC_TYPE)(srcData[0].s3, srcData[1].s3);
+#endif
+# define VEC_TO_ARRAY(v, a) TYPE a[2] = {v.s0, v.s1};
+#elif VEC_SIZE == 4
+ DST_VEC_TYPE dstC0 = (DST_VEC_TYPE)(srcData[0].s0, srcData[1].s0, srcData[2].s0, srcData[3].s0);
+ DST_VEC_TYPE dstC1 = (DST_VEC_TYPE)(srcData[0].s1, srcData[1].s1, srcData[2].s1, srcData[3].s1);
+#if DATA_CHAN > 2
+ DST_VEC_TYPE dstC2 = (DST_VEC_TYPE)(srcData[0].s2, srcData[1].s2, srcData[2].s2, srcData[3].s2);
+#endif
+#if DATA_CHAN > 3
+ DST_VEC_TYPE dstC3 = (DST_VEC_TYPE)(srcData[0].s3, srcData[1].s3, srcData[2].s3, srcData[3].s3);
+#endif
+# define VEC_TO_ARRAY(v, a) TYPE a[4] = {v.s0, v.s1, v.s2, v.s3};
+#endif
- double2 src_data = ((__global double2 *)((__global char *)mat_src + src_idx))[x];
+#ifndef BYPASS_VSTORE
+#define BYPASS_VSTORE false
+#endif
- ((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
- ((__global double *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;
+#define WRITE_VEC_DST(dst, vecValue) \
+{ \
+ int dst ## xOffsetLimitBytes = dst ## Offset.x + size.x * sizeof(TYPE); \
+ int dst ## xOffsetBytes = dst ## Offset.x + x * sizeof(TYPE); \
+ int dst ## yOffsetBytes = (dst ## Offset.y + y) * dst ## StepBytes; \
+ if (!BYPASS_VSTORE && dst ## xOffsetBytes + sizeof(DST_VEC_TYPE) <= dst ## xOffsetLimitBytes) \
+ { \
+ VSTORE_ ## dst(((__global char*)dst + dst ## yOffsetBytes + dst ## xOffsetBytes), vecValue); \
+ } \
+ else \
+ { \
+ VEC_TO_ARRAY(vecValue, vecValue##Array); \
+ for (int i = 0; i < VEC_SIZE; i++, dst ## xOffsetBytes += sizeof(TYPE)) \
+ { \
+ if (dst ## xOffsetBytes + sizeof(TYPE) <= dst ## xOffsetLimitBytes) \
+ *(__global TYPE*)((__global char*)dst + dst ## yOffsetBytes + dst ## xOffsetBytes) = vecValue##Array[i]; \
+ else \
+ break; \
+ } \
+ } \
+}
+
+ WRITE_VEC_DST(dst0, dstC0);
+ WRITE_VEC_DST(dst1, dstC1);
+#if DATA_CHAN > 2
+ WRITE_VEC_DST(dst2, dstC2);
+#endif
+#if DATA_CHAN > 3
+ WRITE_VEC_DST(dst3, dstC3);
+#endif
}
}
-#endif