Add three copy cl files for Enqueue Copy usage.
authorJunyan He <junyan.he@linux.intel.com>
Wed, 26 Mar 2014 10:27:48 +0000 (18:27 +0800)
committerZhigang Gong <zhigang.gong@intel.com>
Wed, 2 Apr 2014 05:06:55 +0000 (13:06 +0800)
Add these three cl files,
one for src and dst are not aligned but have same offset to 4.
second for src's %4 offset is bigger than the dst's
third for src's %4 offset is small than the dst's

Signed-off-by: Junyan He <junyan.he@linux.intel.com>
Reviewed-by: Zhigang Gong <zhigang.gong@linux.intel.com>
src/CMakeLists.txt
src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl [new file with mode: 0644]
src/kernels/cl_internal_copy_buf_unalign_same_offset.cl [new file with mode: 0644]
src/kernels/cl_internal_copy_buf_unalign_src_offset.cl [new file with mode: 0644]

index 4c34235..d690d9a 100644 (file)
@@ -18,7 +18,9 @@ endforeach (KF)
 endmacro (MakeKernelBinStr)
 
 set (KERNEL_STR_FILES)
-set (KERNEL_NAMES cl_internal_copy_buf_align1 cl_internal_copy_buf_align4 cl_internal_copy_buf_align16)
+set (KERNEL_NAMES cl_internal_copy_buf_align1 cl_internal_copy_buf_align4
+cl_internal_copy_buf_align16 cl_internal_copy_buf_unalign_same_offset
+cl_internal_copy_buf_unalign_dst_offset cl_internal_copy_buf_unalign_src_offset)
 MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${KERNEL_NAMES}")
 
 set(OPENCL_SRC
diff --git a/src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl b/src/kernels/cl_internal_copy_buf_unalign_dst_offset.cl
new file mode 100644 (file)
index 0000000..13f4162
--- /dev/null
@@ -0,0 +1,28 @@
+kernel void __cl_cpy_region_unalign_dst_offset ( global int* src, unsigned int src_offset,
+                                     global int* dst, unsigned int dst_offset,
+                                    unsigned int size,
+                                    unsigned int first_mask, unsigned int last_mask,
+                                    unsigned int shift, unsigned int dw_mask)
+{
+    int i = get_global_id(0);
+    unsigned int tmp = 0;
+
+    if (i > size -1)
+        return;
+
+    /* last dw, need to be careful, not to overflow the source. */
+    if ((i == size - 1) && ((last_mask & (~(~dw_mask >> shift))) == 0)) {
+        tmp = ((src[src_offset + i] & ~dw_mask) >> shift);
+    } else {
+        tmp = ((src[src_offset + i] & ~dw_mask) >> shift)
+             | ((src[src_offset + i + 1] & dw_mask) << (32 - shift));
+    }
+
+    if (i == 0) {
+        dst[dst_offset] = (dst[dst_offset] & first_mask) | (tmp & (~first_mask));
+    } else if (i == size - 1) {
+        dst[i+dst_offset] = (tmp & last_mask) | (dst[i+dst_offset] & (~last_mask));
+    } else {
+        dst[i+dst_offset] = tmp;
+    }
+}
diff --git a/src/kernels/cl_internal_copy_buf_unalign_same_offset.cl b/src/kernels/cl_internal_copy_buf_unalign_same_offset.cl
new file mode 100644 (file)
index 0000000..8510246
--- /dev/null
@@ -0,0 +1,19 @@
+kernel void __cl_cpy_region_unalign_same_offset ( global int* src, unsigned int src_offset,
+                                     global int* dst, unsigned int dst_offset,
+                                    unsigned int size,
+                                    unsigned int first_mask, unsigned int last_mask)
+{
+    int i = get_global_id(0);
+    if (i > size -1)
+       return;
+
+    if (i == 0) {
+        dst[dst_offset] = (dst[dst_offset] & first_mask)
+             | (src[src_offset] & (~first_mask));
+    } else if (i == size - 1) {
+        dst[i+dst_offset] = (src[i+src_offset] & last_mask)
+            | (dst[i+dst_offset] & (~last_mask));
+    } else {
+        dst[i+dst_offset] = src[i+src_offset];
+    }
+}
diff --git a/src/kernels/cl_internal_copy_buf_unalign_src_offset.cl b/src/kernels/cl_internal_copy_buf_unalign_src_offset.cl
new file mode 100644 (file)
index 0000000..f98368a
--- /dev/null
@@ -0,0 +1,29 @@
+kernel void __cl_cpy_region_unalign_src_offset ( global int* src, unsigned int src_offset,
+                                     global int* dst, unsigned int dst_offset,
+                                    unsigned int size,
+                                    unsigned int first_mask, unsigned int last_mask,
+                                    unsigned int shift, unsigned int dw_mask, int src_less)
+{
+    int i = get_global_id(0);
+    unsigned int tmp = 0;
+
+    if (i > size -1)
+        return;
+
+    if (i == 0) {
+        tmp = ((src[src_offset + i] & dw_mask) << shift);
+    } else if (src_less && i == size - 1) { // not exceed the bound of source
+        tmp = ((src[src_offset + i - 1] & ~dw_mask) >> (32 - shift));
+    } else {
+        tmp = ((src[src_offset + i - 1] & ~dw_mask) >> (32 - shift))
+             | ((src[src_offset + i] & dw_mask) << shift);
+    }
+
+    if (i == 0) {
+        dst[dst_offset] = (dst[dst_offset] & first_mask) | (tmp & (~first_mask));
+    } else if (i == size - 1) {
+        dst[i+dst_offset] = (tmp & last_mask) | (dst[i+dst_offset] & (~last_mask));
+    } else {
+        dst[i+dst_offset] = tmp;
+    }
+}