From da6fdcd1e21d710c5253417634a9b23543ea0271 Mon Sep 17 00:00:00 2001 From: Junyan He Date: Wed, 26 Mar 2014 18:27:48 +0800 Subject: [PATCH] Add three copy cl files for Enqueue Copy usage. Add these three cl files, one for src and dst are not aligned but have same offset to 4. second for src's %4 offset is bigger than the dst's third for src's %4 offset is small than the dst's Signed-off-by: Junyan He Reviewed-by: Zhigang Gong --- src/CMakeLists.txt | 4 ++- .../cl_internal_copy_buf_unalign_dst_offset.cl | 28 +++++++++++++++++++++ .../cl_internal_copy_buf_unalign_same_offset.cl | 19 ++++++++++++++ .../cl_internal_copy_buf_unalign_src_offset.cl | 29 ++++++++++++++++++++++ 4 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl create mode 100644 src/kernels/cl_internal_copy_buf_unalign_same_offset.cl create mode 100644 src/kernels/cl_internal_copy_buf_unalign_src_offset.cl diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4c34235..d690d9a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -18,7 +18,9 @@ endforeach (KF) endmacro (MakeKernelBinStr) set (KERNEL_STR_FILES) -set (KERNEL_NAMES cl_internal_copy_buf_align1 cl_internal_copy_buf_align4 cl_internal_copy_buf_align16) +set (KERNEL_NAMES cl_internal_copy_buf_align1 cl_internal_copy_buf_align4 +cl_internal_copy_buf_align16 cl_internal_copy_buf_unalign_same_offset +cl_internal_copy_buf_unalign_dst_offset cl_internal_copy_buf_unalign_src_offset) MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${KERNEL_NAMES}") set(OPENCL_SRC diff --git a/src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl b/src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl new file mode 100644 index 0000000..13f4162 --- /dev/null +++ b/src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl @@ -0,0 +1,28 @@ +kernel void __cl_cpy_region_unalign_dst_offset ( global int* src, unsigned int src_offset, + global int* dst, unsigned int dst_offset, + unsigned int size, + unsigned int first_mask, unsigned int last_mask, + unsigned int shift, unsigned int dw_mask) +{ + int i = get_global_id(0); + unsigned int tmp = 0; + + if (i > size -1) + return; + + /* last dw, need to be careful, not to overflow the source. */ + if ((i == size - 1) && ((last_mask & (~(~dw_mask >> shift))) == 0)) { + tmp = ((src[src_offset + i] & ~dw_mask) >> shift); + } else { + tmp = ((src[src_offset + i] & ~dw_mask) >> shift) + | ((src[src_offset + i + 1] & dw_mask) << (32 - shift)); + } + + if (i == 0) { + dst[dst_offset] = (dst[dst_offset] & first_mask) | (tmp & (~first_mask)); + } else if (i == size - 1) { + dst[i+dst_offset] = (tmp & last_mask) | (dst[i+dst_offset] & (~last_mask)); + } else { + dst[i+dst_offset] = tmp; + } +} diff --git a/src/kernels/cl_internal_copy_buf_unalign_same_offset.cl b/src/kernels/cl_internal_copy_buf_unalign_same_offset.cl new file mode 100644 index 0000000..8510246 --- /dev/null +++ b/src/kernels/cl_internal_copy_buf_unalign_same_offset.cl @@ -0,0 +1,19 @@ +kernel void __cl_cpy_region_unalign_same_offset ( global int* src, unsigned int src_offset, + global int* dst, unsigned int dst_offset, + unsigned int size, + unsigned int first_mask, unsigned int last_mask) +{ + int i = get_global_id(0); + if (i > size -1) + return; + + if (i == 0) { + dst[dst_offset] = (dst[dst_offset] & first_mask) + | (src[src_offset] & (~first_mask)); + } else if (i == size - 1) { + dst[i+dst_offset] = (src[i+src_offset] & last_mask) + | (dst[i+dst_offset] & (~last_mask)); + } else { + dst[i+dst_offset] = src[i+src_offset]; + } +} diff --git a/src/kernels/cl_internal_copy_buf_unalign_src_offset.cl b/src/kernels/cl_internal_copy_buf_unalign_src_offset.cl new file mode 100644 index 0000000..f98368a --- /dev/null +++ b/src/kernels/cl_internal_copy_buf_unalign_src_offset.cl @@ -0,0 +1,29 @@ +kernel void __cl_cpy_region_unalign_src_offset ( global int* src, unsigned int src_offset, + global int* dst, unsigned int dst_offset, + unsigned int size, + unsigned int first_mask, unsigned int last_mask, + unsigned int shift, unsigned int dw_mask, int src_less) +{ + int i = get_global_id(0); + unsigned int tmp = 0; + + if (i > size -1) + return; + + if (i == 0) { + tmp = ((src[src_offset + i] & dw_mask) << shift); + } else if (src_less && i == size - 1) { // not exceed the bound of source + tmp = ((src[src_offset + i - 1] & ~dw_mask) >> (32 - shift)); + } else { + tmp = ((src[src_offset + i - 1] & ~dw_mask) >> (32 - shift)) + | ((src[src_offset + i] & dw_mask) << shift); + } + + if (i == 0) { + dst[dst_offset] = (dst[dst_offset] & first_mask) | (tmp & (~first_mask)); + } else if (i == size - 1) { + dst[i+dst_offset] = (tmp & last_mask) | (dst[i+dst_offset] & (~last_mask)); + } else { + dst[i+dst_offset] = tmp; + } +} -- 2.7.4