Allow OpenCL acceleration in every OpenCV module

Author:     Andrey Kamaev <andrey.kamaev@itseez.com>
AuthorDate: Fri, 15 Mar 2013 19:56:31 +0000 (23:56 +0400)
Commit:     Andrey Kamaev <andrey.kamaev@itseez.com>
CommitDate: Thu, 21 Mar 2013 13:57:01 +0000 (17:57 +0400)

diff --git a/CMakeLists.txt b/CMakeLists.txt

index 6657de2..351273e 100644 (file)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -782,7 +782,7 @@ if(HAVE_CUDA)
    status("    Use fast math:"        CUDA_FAST_MATH THEN YES ELSE NO)
  endif()
  
-if(HAVE_OPENCL AND BUILD_opencv_ocl)
+if(HAVE_OPENCL)
    status("")
    status("  OpenCL")
    if(OPENCL_INCLUDE_DIR)
diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake

index b6d129a..abb0393 100644 (file)
--- a/cmake/OpenCVModule.cmake
+++ b/cmake/OpenCVModule.cmake
@@ -432,10 +432,22 @@ macro(ocv_glob_module_sources)
    file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
    file(GLOB lib_hdrs_detail "include/opencv2/${name}/detail/*.hpp" "include/opencv2/${name}/detail/*.h")
  
+  file(GLOB cl_kernels "src/opencl/*.cl")
+
    source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs})
    source_group("Include" FILES ${lib_hdrs})
    source_group("Include\\detail" FILES ${lib_hdrs_detail})
  
+  if(HAVE_OPENCL AND cl_kernels)
+    ocv_include_directories(${OPENCL_INCLUDE_DIRS})
+    add_custom_command(
+      OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp"
+      COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake"
+      DEPENDS ${cl_kernels} "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake")
+    source_group("Src\\OpenCL" FILES ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp")
+    list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp")
+  endif()
+
    ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail} SOURCES ${lib_srcs} ${lib_int_hdrs})
  endmacro()
  
@@ -449,6 +461,9 @@ macro(ocv_create_module)
  
    if(NOT "${ARGN}" STREQUAL "SKIP_LINK")
      target_link_libraries(${the_module} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_MODULE_${the_module}_DEPS_EXT} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${ARGN})
+    if(HAVE_OPENCL AND OPENCL_LIBRARIES)
+      target_link_libraries(${the_module} ${OPENCL_LIBRARIES})
+    endif()
    endif()
  
    add_dependencies(opencv_modules ${the_module})
diff --git a/modules/ocl/cl2cpp.cmake b/cmake/cl2cpp.cmake

similarity index 100%

rename from modules/ocl/cl2cpp.cmake

rename to cmake/cl2cpp.cmake
diff --git a/modules/ocl/CMakeLists.txt b/modules/ocl/CMakeLists.txt

index 7e621f4..8dbe90c 100644 (file)
--- a/modules/ocl/CMakeLists.txt
+++ b/modules/ocl/CMakeLists.txt
@@ -3,45 +3,5 @@ if(NOT HAVE_OPENCL)
  endif()
  
  set(the_description "OpenCL-accelerated Computer Vision")
-ocv_add_module(ocl opencv_core opencv_imgproc opencv_features2d opencv_objdetect opencv_video opencv_nonfree)
-ocv_module_include_directories(${OPENCL_INCLUDE_DIRS})
-
-file(GLOB CL_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/kernels/*.cl")
-set(kernels_cpp "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp")
-set(cl2cpp_script "${CMAKE_CURRENT_SOURCE_DIR}/cl2cpp.cmake")
-
-add_custom_command(
-  OUTPUT ${kernels_cpp}
-  COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/kernels" -DOUTPUT="${kernels_cpp}" -P ${cl2cpp_script}
-  DEPENDS ${CL_FILES} ${cl2cpp_script})
-
-file(GLOB lib_hdrs     "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
-file(GLOB lib_srcs     "src/*.cpp")
-file(GLOB lib_int_hdrs "src/*.h*")
-
-source_group("Include"   FILES ${lib_hdrs})
-source_group("Src\\Host" FILES ${lib_srcs} ${lib_int_hdrs} ${kernels_cpp})
-
+ocv_define_module(ocl opencv_core opencv_imgproc opencv_features2d opencv_objdetect opencv_video opencv_nonfree)
  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow)
-
-ocv_set_module_sources(HEADERS ${lib_hdrs} SOURCES ${lib_int_hdrs} ${lib_srcs} ${kernels_cpp})
-ocv_create_module(${OPENCL_LIBRARIES})
-ocv_add_precompiled_headers(${the_module})
-
-################################################################################################################
-################################      OpenCL Module Tests     ##################################################
-################################################################################################################
-file(GLOB test_srcs "test/*.cpp")
-file(GLOB test_hdrs "test/*.hpp" "test/*.h")
-
-ocv_add_accuracy_tests(FILES "Include" ${test_hdrs}
-                       FILES "Src" ${test_srcs})
-
-################################################################################################################
-################################   OpenCL Module Performance  ##################################################
-################################################################################################################
-file(GLOB perf_srcs "perf/*.cpp")
-file(GLOB perf_hdrs "perf/*.hpp" "perf/*.h")
-
-ocv_add_perf_tests(FILES "Include" ${perf_hdrs}
-                   FILES "Src" ${perf_srcs})
diff --git a/modules/ocl/src/kernels/brute_force_match.cl b/modules/ocl/src/kernels/brute_force_match.cl

deleted file mode 100644 (file)

index e5dd29e..0000000
--- a/modules/ocl/src/kernels/brute_force_match.cl
+++ /dev/null
@@ -1,865 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
-#define MAX_FLOAT 1e7f
-
-int bit1Count(float x)
-{
-       int c = 0;
-       int ix = (int)x;
-       
-       for (int i = 0 ; i < 32 ; i++)
-       {
-               c += ix & 0x1;
-               ix >>= 1;
-       }
-       
-       return (float)c;
-}
-/* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size
-local size: dim0 is block_size, dim1 is block_size.
-*/
-__kernel void BruteForceMatch_UnrollMatch(
-    __global float *query,
-    __global float *train,
-    //__global float *mask,
-    __global int *bestTrainIdx,
-    __global float *bestDistance,
-    __local float *sharebuffer,
-    int block_size,
-    int max_desc_len,
-    int query_rows,
-    int query_cols,
-    int train_rows,
-    int train_cols,
-    int step,
-    int distType
-)
-{
-       const int lidx = get_local_id(0);
-       const int lidy = get_local_id(1);
-       const int groupidx = get_group_id(0);
-       
-       __local float *s_query = sharebuffer;
-       __local float *s_train = sharebuffer + block_size * max_desc_len;
-       
-       int queryIdx = groupidx * block_size + lidy;
-       
-       // load the query into local memory.
-       for (int i = 0 ;  i <  max_desc_len / block_size; i ++)
-       {
-               int loadx = lidx + i * block_size;
-               s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
-       }
-       
-       float myBestDistance = MAX_FLOAT;
-       int myBestTrainIdx = -1;
-       
-       // loopUnrolledCached to find the best trainIdx and best distance.
-       volatile int imgIdx = 0;
-       
-       for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
-       {
-               float result = 0;
-               
-               for (int i = 0 ; i < max_desc_len / block_size ; i++)
-               {
-                       //load a block_size * block_size block into local train.
-                       const int loadx = lidx + i * block_size;
-                       s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
-                       
-                       //synchronize to make sure each elem for reduceIteration in share memory is written already.
-                       barrier(CLK_LOCAL_MEM_FENCE);
-                       
-                       /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
-                       sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-                       
-                       switch (distType)
-                       {
-                               case 0:
-                               
-                                       for (int j = 0 ; j < block_size ; j++)
-                                       {
-                                               result += fabs(s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx]);
-                                       }
-                                       
-                                       break;
-                               case 1:
-                               
-                                       for (int j = 0 ; j < block_size ; j++)
-                                       {
-                                               float qr = s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx];
-                                               result += qr * qr;
-                                       }
-                                       
-                                       break;
-                               case 2:
-                               
-                                       for (int j = 0 ; j < block_size ; j++)
-                                       {
-                                               //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-                                               result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
-                                       }
-                                       
-                                       break;
-                       }
-                       
-                       barrier(CLK_LOCAL_MEM_FENCE);
-               }
-               
-               int trainIdx = t * block_size + lidx;
-               
-               if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
-               {
-                       //bestImgIdx = imgIdx;
-                       myBestDistance = result;
-                       myBestTrainIdx = trainIdx;
-               }
-       }
-       
-       barrier(CLK_LOCAL_MEM_FENCE);
-       __local float *s_distance = (__local float *)(sharebuffer);
-       __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
-       
-       //find BestMatch
-       s_distance += lidy * block_size;
-       s_trainIdx += lidy * block_size;
-       s_distance[lidx] = myBestDistance;
-       s_trainIdx[lidx] = myBestTrainIdx;
-       
-       barrier(CLK_LOCAL_MEM_FENCE);
-       
-       //reduce -- now all reduce implement in each threads.
-       for (int k = 0 ; k < block_size; k++)
-       {
-               if (myBestDistance > s_distance[k])
-               {
-                       myBestDistance = s_distance[k];
-                       myBestTrainIdx = s_trainIdx[k];
-               }
-       }
-       
-       if (queryIdx < query_rows && lidx == 0)
-       {
-               bestTrainIdx[queryIdx] = myBestTrainIdx;
-               bestDistance[queryIdx] = myBestDistance;
-       }
-}
-
-__kernel void BruteForceMatch_Match(
-    __global float *query,
-    __global float *train,
-    //__global float *mask,
-    __global int *bestTrainIdx,
-    __global float *bestDistance,
-    __local float *sharebuffer,
-    int block_size,
-    int query_rows,
-    int query_cols,
-    int train_rows,
-    int train_cols,
-    int step,
-    int distType
-)
-{
-       const int lidx = get_local_id(0);
-       const int lidy = get_local_id(1);
-       const int groupidx = get_group_id(0);
-       
-       const int queryIdx = groupidx * block_size + lidy;
-       
-       float myBestDistance = MAX_FLOAT;
-       int myBestTrainIdx = -1;
-       
-       __local float *s_query = sharebuffer;
-       __local float *s_train = sharebuffer + block_size * block_size;
-       
-       // loop
-       for (int t = 0 ;  t < (train_rows + block_size - 1) / block_size ; t++)
-       {
-               //Dist dist;
-               float result = 0;
-               
-               for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
-               {
-                       const int loadx = lidx + i * block_size;
-                       //load query and train into local memory
-                       s_query[lidy * block_size + lidx] = 0;
-                       s_train[lidx * block_size + lidy] = 0;
-                       
-                       if (loadx < query_cols)
-                       {
-                               s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
-                               s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
-                       }
-                       
-                       barrier(CLK_LOCAL_MEM_FENCE);
-                       
-                       /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
-                       sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-                       
-                       switch (distType)
-                       {
-                               case 0:
-                               
-                                       for (int j = 0 ; j < block_size ; j++)
-                                       {
-                                               result += fabs(s_query[lidy * block_size + j] -  s_train[j * block_size + lidx]);
-                                       }
-                                       
-                                       break;
-                               case 1:
-                               
-                                       for (int j = 0 ; j < block_size ; j++)
-                                       {
-                                               float qr = s_query[lidy * block_size + j] -  s_train[j * block_size + lidx];
-                                               result += qr * qr;
-                                       }
-                                       
-                                       break;
-                               case 2:
-                               
-                                       for (int j = 0 ; j < block_size ; j++)
-                                       {
-                                               //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-                                               result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
-                                       }
-                                       
-                                       break;
-                       }
-                       
-                       barrier(CLK_LOCAL_MEM_FENCE);
-               }
-               
-               const int trainIdx = t * block_size + lidx;
-               
-               if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
-               {
-                       //myBestImgidx = imgIdx;
-                       myBestDistance = result;
-                       myBestTrainIdx = trainIdx;
-               }
-       }
-       
-       barrier(CLK_LOCAL_MEM_FENCE);
-       
-       __local float *s_distance = (__local float *)sharebuffer;
-       __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
-       
-       //findBestMatch
-       s_distance += lidy * block_size;
-       s_trainIdx += lidy * block_size;
-       s_distance[lidx] = myBestDistance;
-       s_trainIdx[lidx] = myBestTrainIdx;
-       
-       barrier(CLK_LOCAL_MEM_FENCE);
-       
-       //reduce -- now all reduce implement in each threads.
-       for (int k = 0 ; k < block_size; k++)
-       {
-               if (myBestDistance > s_distance[k])
-               {
-                       myBestDistance = s_distance[k];
-                       myBestTrainIdx = s_trainIdx[k];
-               }
-       }
-       
-       if (queryIdx < query_rows && lidx == 0)
-       {
-               bestTrainIdx[queryIdx] = myBestTrainIdx;
-               bestDistance[queryIdx] = myBestDistance;
-       }
-}
-
-//radius_unrollmatch
-__kernel void BruteForceMatch_RadiusUnrollMatch(
-    __global float *query,
-    __global float *train,
-    float maxDistance,
-    //__global float *mask,
-    __global int *bestTrainIdx,
-    __global float *bestDistance,
-    __global int *nMatches,
-    __local float *sharebuffer,
-    int block_size,
-    int max_desc_len,
-    int query_rows,
-    int query_cols,
-    int train_rows,
-    int train_cols,
-    int bestTrainIdx_cols,
-    int step,
-    int ostep,
-    int distType
-)
-{
-       const int lidx = get_local_id(0);
-       const int lidy = get_local_id(1);
-       const int groupidx = get_group_id(0);
-       const int groupidy = get_group_id(1);
-       
-       const int queryIdx = groupidy * block_size + lidy;
-       const int trainIdx = groupidx * block_size + lidx;
-       
-       __local float *s_query = sharebuffer;
-       __local float *s_train = sharebuffer + block_size * block_size;
-       
-       float result = 0;
-       
-       for (int i = 0 ; i < max_desc_len / block_size ; ++i)
-       {
-               //load a block_size * block_size block into local train.
-               const int loadx = lidx + i * block_size;
-               
-               s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
-               s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
-               
-               //synchronize to make sure each elem for reduceIteration in share memory is written already.
-               barrier(CLK_LOCAL_MEM_FENCE);
-               
-               /* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
-               sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-               
-               switch (distType)
-               {
-                       case 0:
-                       
-                               for (int j = 0 ; j < block_size ; ++j)
-                               {
-                                       result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
-                               }
-                               
-                               break;
-                       case 1:
-                       
-                               for (int j = 0 ; j < block_size ; ++j)
-                               {
-                                       float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
-                                       result += qr * qr;
-                               }
-                               
-                               break;
-                       case 2:
-                       
-                               for (int j = 0 ; j < block_size ; ++j)
-                               {
-                                       result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
-                               }
-                               
-                               break;
-               }
-               
-               barrier(CLK_LOCAL_MEM_FENCE);
-       }
-       
-       if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
-       {
-               unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
-               
-               if (ind < bestTrainIdx_cols)
-               {
-                       //bestImgIdx = imgIdx;
-                       bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
-                       bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
-               }
-       }
-}
-
-//radius_match
-__kernel void BruteForceMatch_RadiusMatch(
-    __global float *query,
-    __global float *train,
-    float maxDistance,
-    //__global float *mask,
-    __global int *bestTrainIdx,
-    __global float *bestDistance,
-    __global int *nMatches,
-    __local float *sharebuffer,
-    int block_size,
-    int query_rows,
-    int query_cols,
-    int train_rows,
-    int train_cols,
-    int bestTrainIdx_cols,
-    int step,
-    int ostep,
-    int distType
-)
-{
-       const int lidx = get_local_id(0);
-       const int lidy = get_local_id(1);
-       const int groupidx = get_group_id(0);
-       const int groupidy = get_group_id(1);
-       
-       const int queryIdx = groupidy * block_size + lidy;
-       const int trainIdx = groupidx * block_size + lidx;
-       
-       __local float *s_query = sharebuffer;
-       __local float *s_train = sharebuffer + block_size * block_size;
-       
-       float result = 0;
-       
-       for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i)
-       {
-               //load a block_size * block_size block into local train.
-               const int loadx = lidx + i * block_size;
-               
-               s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
-               s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
-               
-               //synchronize to make sure each elem for reduceIteration in share memory is written already.
-               barrier(CLK_LOCAL_MEM_FENCE);
-               
-               /* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
-               sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-               
-               switch (distType)
-               {
-                       case 0:
-                       
-                               for (int j = 0 ; j < block_size ; ++j)
-                               {
-                                       result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
-                               }
-                               
-                               break;
-                       case 1:
-                       
-                               for (int j = 0 ; j < block_size ; ++j)
-                               {
-                                       float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
-                                       result += qr * qr;
-                               }
-                               
-                               break;
-                       case 2:
-                       
-                               for (int j = 0 ; j < block_size ; ++j)
-                               {
-                                       result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
-                               }
-                               
-                               break;
-               }
-               
-               barrier(CLK_LOCAL_MEM_FENCE);
-       }
-       
-       if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
-       {
-               unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
-               
-               if (ind < bestTrainIdx_cols)
-               {
-                       //bestImgIdx = imgIdx;
-                       bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
-                       bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
-               }
-       }
-}
-
-
-__kernel void BruteForceMatch_knnUnrollMatch(
-    __global float *query,
-    __global float *train,
-    //__global float *mask,
-    __global int2 *bestTrainIdx,
-    __global float2 *bestDistance,
-    __local float *sharebuffer,
-    int block_size,
-    int max_desc_len,
-    int query_rows,
-    int query_cols,
-    int train_rows,
-    int train_cols,
-    int step,
-    int distType
-)
-{
-       const int lidx = get_local_id(0);
-       const int lidy = get_local_id(1);
-       const int groupidx = get_group_id(0);
-       
-       const int queryIdx = groupidx * block_size + lidy;
-       local float *s_query = sharebuffer;
-       local float *s_train = sharebuffer + block_size * max_desc_len;
-       
-       // load the query into local memory.
-       for (int i = 0 ;  i <  max_desc_len / block_size; i ++)
-       {
-               int loadx = lidx + i * block_size;
-               s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
-       }
-       
-       float myBestDistance1 = MAX_FLOAT;
-       float myBestDistance2 = MAX_FLOAT;
-       int myBestTrainIdx1 = -1;
-       int myBestTrainIdx2 = -1;
-       
-       //loopUnrolledCached
-       volatile int imgIdx = 0;
-       
-       for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
-       {
-               float result = 0;
-               
-               for (int i = 0 ; i < max_desc_len / block_size ; i++)
-               {
-                       const int loadX = lidx + i * block_size;
-                       //load a block_size * block_size block into local train.
-                       const int loadx = lidx + i * block_size;
-                       s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
-                       
-                       //synchronize to make sure each elem for reduceIteration in share memory is written already.
-                       barrier(CLK_LOCAL_MEM_FENCE);
-                       
-                       /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
-                       sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-                       
-                       switch (distType)
-                       {
-                               case 0:
-                               
-                                       for (int j = 0 ; j < block_size ; j++)
-                                       {
-                                               result += fabs(s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx]);
-                                       }
-                                       
-                                       break;
-                               case 1:
-                               
-                                       for (int j = 0 ; j < block_size ; j++)
-                                       {
-                                               float qr = s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx];
-                                               result += qr * qr;
-                                       }
-                                       
-                                       break;
-                               case 2:
-                               
-                                       for (int j = 0 ; j < block_size ; j++)
-                                       {
-                                               //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-                                               result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
-                                       }
-                                       
-                                       break;
-                       }
-                       
-                       barrier(CLK_LOCAL_MEM_FENCE);
-               }
-               
-               const int trainIdx = t * block_size + lidx;
-               
-               if (queryIdx < query_rows && trainIdx < train_rows)
-               {
-                       if (result < myBestDistance1)
-                       {
-                               myBestDistance2 = myBestDistance1;
-                               myBestTrainIdx2 = myBestTrainIdx1;
-                               myBestDistance1 = result;
-                               myBestTrainIdx1 = trainIdx;
-                       }
-                       else if (result < myBestDistance2)
-                       {
-                               myBestDistance2 = result;
-                               myBestTrainIdx2 = trainIdx;
-                       }
-               }
-       }
-       
-       barrier(CLK_LOCAL_MEM_FENCE);
-       
-       local float *s_distance = (local float *)sharebuffer;
-       local int *s_trainIdx = (local int *)(sharebuffer + block_size * block_size);
-       
-       // find BestMatch
-       s_distance += lidy * block_size;
-       s_trainIdx += lidy * block_size;
-       
-       s_distance[lidx] = myBestDistance1;
-       s_trainIdx[lidx] = myBestTrainIdx1;
-       
-       float bestDistance1 = MAX_FLOAT;
-       float bestDistance2 = MAX_FLOAT;
-       int bestTrainIdx1 = -1;
-       int bestTrainIdx2 = -1;
-       barrier(CLK_LOCAL_MEM_FENCE);
-       
-       if (lidx == 0)
-       {
-               for (int i = 0 ; i < block_size ; i++)
-               {
-                       float val = s_distance[i];
-                       
-                       if (val < bestDistance1)
-                       {
-                               bestDistance2 = bestDistance1;
-                               bestTrainIdx2 = bestTrainIdx1;
-                               
-                               bestDistance1 = val;
-                               bestTrainIdx1 = s_trainIdx[i];
-                       }
-                       else if (val < bestDistance2)
-                       {
-                               bestDistance2 = val;
-                               bestTrainIdx2 = s_trainIdx[i];
-                       }
-               }
-       }
-       
-       barrier(CLK_LOCAL_MEM_FENCE);
-       
-       s_distance[lidx] = myBestDistance2;
-       s_trainIdx[lidx] = myBestTrainIdx2;
-       
-       barrier(CLK_LOCAL_MEM_FENCE);
-       
-       if (lidx == 0)
-       {
-               for (int i = 0 ; i < block_size ; i++)
-               {
-                       float val = s_distance[i];
-                       
-                       if (val < bestDistance2)
-                       {
-                               bestDistance2 = val;
-                               bestTrainIdx2 = s_trainIdx[i];
-                       }
-               }
-       }
-       
-       myBestDistance1 = bestDistance1;
-       myBestDistance2 = bestDistance2;
-       
-       myBestTrainIdx1 = bestTrainIdx1;
-       myBestTrainIdx2 = bestTrainIdx2;
-       
-       if (queryIdx < query_rows && lidx == 0)
-       {
-               bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
-               bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
-       }
-}
-
-__kernel void BruteForceMatch_knnMatch(
-    __global float *query,
-    __global float *train,
-    //__global float *mask,
-    __global int2 *bestTrainIdx,
-    __global float2 *bestDistance,
-    __local float *sharebuffer,
-    int block_size,
-    int query_rows,
-    int query_cols,
-    int train_rows,
-    int train_cols,
-    int step,
-    int distType
-)
-{
-       const int lidx = get_local_id(0);
-       const int lidy = get_local_id(1);
-       const int groupidx = get_group_id(0);
-       
-       const int queryIdx = groupidx * block_size + lidy;
-       local float *s_query = sharebuffer;
-       local float *s_train = sharebuffer + block_size * block_size;
-       
-       float myBestDistance1 = MAX_FLOAT;
-       float myBestDistance2 = MAX_FLOAT;
-       int myBestTrainIdx1 = -1;
-       int myBestTrainIdx2 = -1;
-       
-       //loop
-       for (int  t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
-       {
-               float result = 0.0f;
-               
-               for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
-               {
-                       const int loadx = lidx + i * block_size;
-                       //load query and train into local memory
-                       s_query[lidy * block_size + lidx] = 0;
-                       s_train[lidx * block_size + lidy] = 0;
-                       
-                       if (loadx < query_cols)
-                       {
-                               s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
-                               s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
-                       }
-                       
-                       barrier(CLK_LOCAL_MEM_FENCE);
-                       
-                       /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
-                       sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-                       
-                       switch (distType)
-                       {
-                               case 0:
-                               
-                                       for (int j = 0 ; j < block_size ; j++)
-                                       {
-                                               result += fabs(s_query[lidy * block_size + j] -  s_train[j * block_size + lidx]);
-                                       }
-                                       
-                                       break;
-                               case 1:
-                               
-                                       for (int j = 0 ; j < block_size ; j++)
-                                       {
-                                               float qr = s_query[lidy * block_size + j] -  s_train[j * block_size + lidx];
-                                               result += qr * qr;
-                                       }
-                                       
-                                       break;
-                               case 2:
-                               
-                                       for (int j = 0 ; j < block_size ; j++)
-                                       {
-                                               //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
-                                               result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
-                                       }
-                                       
-                                       break;
-                       }
-                       
-                       barrier(CLK_LOCAL_MEM_FENCE);
-               }
-               
-               const int trainIdx = t * block_size + lidx;
-               
-               if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
-               {
-                       if (result < myBestDistance1)
-                       {
-                               myBestDistance2 = myBestDistance1;
-                               myBestTrainIdx2 = myBestTrainIdx1;
-                               myBestDistance1 = result;
-                               myBestTrainIdx1 = trainIdx;
-                       }
-                       else if (result < myBestDistance2)
-                       {
-                               myBestDistance2 = result;
-                               myBestTrainIdx2 = trainIdx;
-                       }
-               }
-       }
-       
-       barrier(CLK_LOCAL_MEM_FENCE);
-       
-       __local float *s_distance = (__local float *)sharebuffer;
-       __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
-       
-       //findBestMatch
-       s_distance += lidy * block_size;
-       s_trainIdx += lidy * block_size;
-       
-       s_distance[lidx] = myBestDistance1;
-       s_trainIdx[lidx] = myBestTrainIdx1;
-       
-       float bestDistance1 = MAX_FLOAT;
-       float bestDistance2 = MAX_FLOAT;
-       int bestTrainIdx1 = -1;
-       int bestTrainIdx2 = -1;
-       barrier(CLK_LOCAL_MEM_FENCE);
-       
-       if (lidx == 0)
-       {
-               for (int i = 0 ; i < block_size ; i++)
-               {
-                       float val = s_distance[i];
-                       
-                       if (val < bestDistance1)
-                       {
-                               bestDistance2 = bestDistance1;
-                               bestTrainIdx2 = bestTrainIdx1;
-                               
-                               bestDistance1 = val;
-                               bestTrainIdx1 = s_trainIdx[i];
-                       }
-                       else if (val < bestDistance2)
-                       {
-                               bestDistance2 = val;
-                               bestTrainIdx2 = s_trainIdx[i];
-                       }
-               }
-       }
-       
-       barrier(CLK_LOCAL_MEM_FENCE);
-       
-       s_distance[lidx] = myBestDistance2;
-       s_trainIdx[lidx] = myBestTrainIdx2;
-       
-       barrier(CLK_LOCAL_MEM_FENCE);
-       
-       if (lidx == 0)
-       {
-               for (int i = 0 ; i < block_size ; i++)
-               {
-                       float val = s_distance[i];
-                       
-                       if (val < bestDistance2)
-                       {
-                               bestDistance2 = val;
-                               bestTrainIdx2 = s_trainIdx[i];
-                       }
-               }
-       }
-       
-       myBestDistance1 = bestDistance1;
-       myBestDistance2 = bestDistance2;
-       
-       myBestTrainIdx1 = bestTrainIdx1;
-       myBestTrainIdx2 = bestTrainIdx2;
-       
-       if (queryIdx < query_rows && lidx == 0)
-       {
-               bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
-               bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
-       }
-}
-
-kernel void BruteForceMatch_calcDistanceUnrolled(
-    __global float *query,
-    __global float *train,
-    //__global float *mask,
-    __global float *allDist,
-    __local float *sharebuffer,
-    int block_size,
-    int max_desc_len,
-    int query_rows,
-    int query_cols,
-    int train_rows,
-    int train_cols,
-    int step,
-    int distType)
-{
-       /* Todo */
-}
-
-kernel void BruteForceMatch_calcDistance(
-    __global float *query,
-    __global float *train,
-    //__global float *mask,
-    __global float *allDist,
-    __local float *sharebuffer,
-    int block_size,
-    int query_rows,
-    int query_cols,
-    int train_rows,
-    int train_cols,
-    int step,
-    int distType)
-{
-       /* Todo */
-}
-
-kernel void BruteForceMatch_findBestMatch(
-    __global float *allDist,
-    __global int *bestTrainIdx,
-    __global float *bestDistance,
-    int k,
-    int block_size
-)
-{
-       /* Todo */
-}
-\ No newline at end of file
diff --git a/modules/ocl/src/kernels/arithm_2_mat.cl b/modules/ocl/src/opencl/arithm_2_mat.cl

similarity index 100%

rename from modules/ocl/src/kernels/arithm_2_mat.cl

rename to modules/ocl/src/opencl/arithm_2_mat.cl
diff --git a/modules/ocl/src/kernels/arithm_LUT.cl b/modules/ocl/src/opencl/arithm_LUT.cl

similarity index 100%

rename from modules/ocl/src/kernels/arithm_LUT.cl

rename to modules/ocl/src/opencl/arithm_LUT.cl
diff --git a/modules/ocl/src/kernels/arithm_absdiff.cl b/modules/ocl/src/opencl/arithm_absdiff.cl

similarity index 100%

rename from modules/ocl/src/kernels/arithm_absdiff.cl

rename to modules/ocl/src/opencl/arithm_absdiff.cl
diff --git a/modules/ocl/src/kernels/arithm_add.cl b/modules/ocl/src/opencl/arithm_add.cl

similarity index 100%

rename from modules/ocl/src/kernels/arithm_add.cl

rename to modules/ocl/src/opencl/arithm_add.cl
diff --git a/modules/ocl/src/kernels/arithm_addWeighted.cl b/modules/ocl/src/opencl/arithm_addWeighted.cl

similarity index 95%

rename from modules/ocl/src/kernels/arithm_addWeighted.cl

rename to modules/ocl/src/opencl/arithm_addWeighted.cl

index 7e9df6f..d76f994 100644 (file)
--- a/modules/ocl/src/kernels/arithm_addWeighted.cl
+++ b/modules/ocl/src/opencl/arithm_addWeighted.cl
@@ -61,29 +61,29 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
      int y = get_global_id(1);
  
      if (x < cols && y < rows)
-    
+
      {
  
          x = x << 2;
          #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); 
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
  
-               uchar4 src1_data ,src2_data;
+        uchar4 src1_data ,src2_data;
  
-               src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
-               src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
-               src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
-               src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
+        src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
+        src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
+        src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
+        src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
  
-               src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
-               src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
-               src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
-               src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
+        src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
+        src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
+        src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
+        src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
  
          uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
  //        short4 tmp      = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama;
@@ -117,14 +117,14 @@ __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offs
      int y = get_global_id(1);
  
      if (x < cols && y < rows)
-    
+
      {
  
          x = x << 2;
  
          #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1)); 
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1)); 
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
@@ -177,14 +177,14 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
      int y = get_global_id(1);
  
      if (x < cols && y < rows)
-    
+
      {
  
          x = x << 2;
  
          #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1)); 
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1)); 
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
@@ -236,18 +236,18 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
      int y = get_global_id(1);
  
      if (x < cols && y < rows)
-    
+
      {
-            
+
          x = x << 2;
  
          #define bitOfInt  (sizeof(int)== 4 ? 2: 3)
  
          #define dst_align ((dst_offset >> bitOfInt) & 3)
  
-        int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt)); 
-        int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt)); 
-       
+        int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
+        int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
+
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt));
@@ -256,7 +256,7 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
      int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix));
          int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix));
-        
+
      if(src1_index < 0)
      {
          int4 tmp;
@@ -299,16 +299,16 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
      int y = get_global_id(1);
  
      if (x < cols && y < rows)
-    
+
      {
-            
+
          x = x << 2;
  
          #define dst_align ((dst_offset >> 2) & 3)
  
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 
-       
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
@@ -361,16 +361,16 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
      int y = get_global_id(1);
  
      if (x < cols && y < rows)
-    
+
      {
-            
+
          x = x << 2;
  
          #define dst_align ((dst_offset >> 3) & 3)
  
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); 
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); 
-       
+        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
+        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
+
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3));
diff --git a/modules/ocl/src/kernels/arithm_add_scalar.cl b/modules/ocl/src/opencl/arithm_add_scalar.cl

similarity index 100%

rename from modules/ocl/src/kernels/arithm_add_scalar.cl

rename to modules/ocl/src/opencl/arithm_add_scalar.cl
diff --git a/modules/ocl/src/kernels/arithm_add_scalar_mask.cl b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl

similarity index 100%

rename from modules/ocl/src/kernels/arithm_add_scalar_mask.cl

rename to modules/ocl/src/opencl/arithm_add_scalar_mask.cl
diff --git a/modules/ocl/src/kernels/arithm_bitwise_and.cl b/modules/ocl/src/opencl/arithm_bitwise_and.cl

similarity index 95%

rename from modules/ocl/src/kernels/arithm_bitwise_and.cl

rename to modules/ocl/src/opencl/arithm_bitwise_and.cl

index f954452..8adc56d 100644 (file)
--- a/modules/ocl/src/kernels/arithm_bitwise_and.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_and.cl
@@ -63,8 +63,8 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
          x = x << 2;
  
          #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); 
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
@@ -75,14 +75,14 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
       uchar4 src2_data = vload4(0, src2 + src2_index_fix);
  
       if(src1_index < 0)
-     {     
+     {
          uchar4 tmp;
          tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
          src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }                 
-     if(src2_index < 0)  
-     {                         
-        uchar4 tmp;                   
+     }
+     if(src2_index < 0)
+     {
+        uchar4 tmp;
          tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
          src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
       }
@@ -113,8 +113,8 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
          x = x << 2;
  
          #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); 
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
@@ -126,14 +126,14 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
       char4 src2_data = vload4(0, src2 + src2_index_fix);
  
       if(src1_index < 0)
-     {     
+     {
          char4 tmp;
          tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
          src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }                 
-     if(src2_index < 0)  
-     {                         
-        char4 tmp;                   
+     }
+     if(src2_index < 0)
+     {
+        char4 tmp;
          tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
          src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
       }
@@ -164,8 +164,8 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
          x = x << 2;
  
          #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
@@ -177,14 +177,14 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
          ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
  
       if(src1_index < 0)
-     {     
+     {
          ushort4 tmp;
          tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
          src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }                 
-     if(src2_index < 0)  
-     {                         
-        ushort4 tmp;                   
+     }
+     if(src2_index < 0)
+     {
+        ushort4 tmp;
          tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
          src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
       }
@@ -216,8 +216,8 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
          x = x << 2;
  
          #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
@@ -229,14 +229,14 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
          short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
  
       if(src1_index < 0)
-     {     
+     {
          short4 tmp;
          tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
          src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }                 
-     if(src2_index < 0)  
-     {                         
-        short4 tmp;                   
+     }
+     if(src2_index < 0)
+     {
+        short4 tmp;
          tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
          src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
       }
@@ -320,4 +320,3 @@ __kernel void arithm_bitwise_and_D6 (__global char *src1, int src1_step, int src
      }
  }
  #endif
-
diff --git a/modules/ocl/src/kernels/arithm_bitwise_and_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl

similarity index 99%

rename from modules/ocl/src/kernels/arithm_bitwise_and_mask.cl

rename to modules/ocl/src/opencl/arithm_bitwise_and_mask.cl

index d1f745f..595fb2c 100644 (file)
--- a/modules/ocl/src/kernels/arithm_bitwise_and_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl
@@ -1135,4 +1135,3 @@ __kernel void arithm_bitwise_and_with_mask_C4_D6 (__global char *src1, int src1_
      }
  }
  #endif
-
diff --git a/modules/ocl/src/kernels/arithm_bitwise_and_scalar.cl b/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl

similarity index 100%

rename from modules/ocl/src/kernels/arithm_bitwise_and_scalar.cl

rename to modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl
diff --git a/modules/ocl/src/kernels/arithm_bitwise_and_scalar_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl

similarity index 99%

rename from modules/ocl/src/kernels/arithm_bitwise_and_scalar_mask.cl

rename to modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl

index 50304aa..beafd7e 100644 (file)
--- a/modules/ocl/src/kernels/arithm_bitwise_and_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl
@@ -1055,4 +1055,3 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D6 (__global short *src1, int sr
      }
  }
  #endif
-
diff --git a/modules/ocl/src/kernels/arithm_bitwise_not.cl b/modules/ocl/src/opencl/arithm_bitwise_not.cl

similarity index 99%

rename from modules/ocl/src/kernels/arithm_bitwise_not.cl

rename to modules/ocl/src/opencl/arithm_bitwise_not.cl

index 64bcc17..fd9d2cc 100644 (file)
--- a/modules/ocl/src/kernels/arithm_bitwise_not.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_not.cl
@@ -62,7 +62,7 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
          x = x << 2;
  
          #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
@@ -72,7 +72,7 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
  
          uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
          uchar4 tmp_data = ~ src1_data;
-        
+
    /*  if(src1_index < 0)
      {
        uchar4 tmp;
@@ -102,7 +102,7 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
          x = x << 2;
  
          #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
@@ -136,7 +136,7 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
          x = x << 2;
  
          #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
@@ -171,7 +171,7 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
          x = x << 2;
  
          #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
@@ -245,14 +245,13 @@ __kernel void arithm_bitwise_not_D6 (__global char *src, int src_step, int src_o
      {
          int src_index = mad24(y, src_step, (x << 3) + src_offset);
          int dst_index = mad24(y, dst_step,  (x << 3) + dst_offset);
-         
+
          char8 data;
  
          data = *((__global char8 *)((__global char *)src + src_index));
          data = ~ data;
-        
+
          *((__global char8 *)((__global char *)dst + dst_index)) = data;
      }
  }
  #endif
-
diff --git a/modules/ocl/src/kernels/arithm_bitwise_or.cl b/modules/ocl/src/opencl/arithm_bitwise_or.cl

similarity index 98%

rename from modules/ocl/src/kernels/arithm_bitwise_or.cl

rename to modules/ocl/src/opencl/arithm_bitwise_or.cl

index 01e3a2f..a95e59e 100644 (file)
--- a/modules/ocl/src/kernels/arithm_bitwise_or.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_or.cl
@@ -63,8 +63,8 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
          x = x << 2;
  
          #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); 
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
@@ -111,8 +111,8 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
          x = x << 2;
  
          #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); 
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
@@ -148,8 +148,8 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
          x = x << 2;
  
          #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
@@ -186,8 +186,8 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
          x = x << 2;
  
          #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
@@ -276,4 +276,3 @@ __kernel void arithm_bitwise_or_D6 (__global char *src1, int src1_step, int src1
      }
  }
  #endif
-
diff --git a/modules/ocl/src/kernels/arithm_bitwise_or_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl

similarity index 99%

rename from modules/ocl/src/kernels/arithm_bitwise_or_mask.cl

rename to modules/ocl/src/opencl/arithm_bitwise_or_mask.cl

index 92d98ec..aedb68c 100644 (file)
--- a/modules/ocl/src/kernels/arithm_bitwise_or_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl
@@ -1135,4 +1135,3 @@ __kernel void arithm_bitwise_or_with_mask_C4_D6 (__global char *src1, int src1_s
      }
  }
  #endif
-
diff --git a/modules/ocl/src/kernels/arithm_bitwise_or_scalar.cl b/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl

similarity index 99%

rename from modules/ocl/src/kernels/arithm_bitwise_or_scalar.cl

rename to modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl

index bbd5f3f..5b94591 100644 (file)
--- a/modules/ocl/src/kernels/arithm_bitwise_or_scalar.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl
@@ -911,4 +911,3 @@ __kernel void arithm_s_bitwise_or_C4_D6 (__global short *src1, int src1_step, in
      }
  }
  #endif
-
diff --git a/modules/ocl/src/kernels/arithm_bitwise_or_scalar_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl

similarity index 99%

rename from modules/ocl/src/kernels/arithm_bitwise_or_scalar_mask.cl

rename to modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl

index 1533987..54066c2 100644 (file)
--- a/modules/ocl/src/kernels/arithm_bitwise_or_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl
@@ -1078,4 +1078,3 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D6 (__global short *src1, int src
      }
  }
  #endif
-
diff --git a/modules/ocl/src/kernels/arithm_bitwise_xor.cl b/modules/ocl/src/opencl/arithm_bitwise_xor.cl

similarity index 95%

rename from modules/ocl/src/kernels/arithm_bitwise_xor.cl

rename to modules/ocl/src/opencl/arithm_bitwise_xor.cl

index 6e83ef5..4f74377 100644 (file)
--- a/modules/ocl/src/kernels/arithm_bitwise_xor.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_xor.cl
@@ -63,8 +63,8 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
          x = x << 2;
  
          #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); 
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
@@ -76,14 +76,14 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
          uchar4 src2_data = vload4(0, src2 + src2_index_fix);
  
       if(src1_index < 0)
-     {     
+     {
          uchar4 tmp;
          tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
          src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }                 
-     if(src2_index < 0)  
-     {                         
-        uchar4 tmp;                   
+     }
+     if(src2_index < 0)
+     {
+        uchar4 tmp;
          tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
          src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
       }
@@ -113,8 +113,8 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
          x = x << 2;
  
          #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); 
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
@@ -126,14 +126,14 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
          char4 src2_data = vload4(0, src2 + src2_index_fix);
  
       if(src1_index < 0)
-     {     
+     {
          char4 tmp;
          tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
          src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }                 
-     if(src2_index < 0)  
-     {                         
-        char4 tmp;                   
+     }
+     if(src2_index < 0)
+     {
+        char4 tmp;
          tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
          src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
       }
@@ -164,8 +164,8 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
          x = x << 2;
  
          #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
@@ -177,14 +177,14 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
          ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
  
       if(src1_index < 0)
-     {     
+     {
          ushort4 tmp;
          tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
          src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }                 
-     if(src2_index < 0)  
-     {                         
-        ushort4 tmp;                   
+     }
+     if(src2_index < 0)
+     {
+        ushort4 tmp;
          tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
          src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
       }
@@ -216,8 +216,8 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
          x = x << 2;
  
          #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
@@ -231,14 +231,14 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
          short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
  
       if(src1_index < 0)
-     {     
+     {
          short4 tmp;
          tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
          src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }                 
-     if(src2_index < 0)  
-     {                         
-        short4 tmp;                   
+     }
+     if(src2_index < 0)
+     {
+        short4 tmp;
          tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
          src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
       }
@@ -324,4 +324,3 @@ __kernel void arithm_bitwise_xor_D6 (__global char *src1, int src1_step, int src
      }
  }
  #endif
-
diff --git a/modules/ocl/src/kernels/arithm_bitwise_xor_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl

similarity index 99%

rename from modules/ocl/src/kernels/arithm_bitwise_xor_mask.cl

rename to modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl

index 248654e..4359d86 100644 (file)
--- a/modules/ocl/src/kernels/arithm_bitwise_xor_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl
@@ -1135,4 +1135,3 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D6 (__global char *src1, int src1_
      }
  }
  #endif
-
diff --git a/modules/ocl/src/kernels/arithm_bitwise_xor_scalar.cl b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl

similarity index 100%

rename from modules/ocl/src/kernels/arithm_bitwise_xor_scalar.cl

rename to modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl
diff --git a/modules/ocl/src/kernels/arithm_bitwise_xor_scalar_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl

similarity index 99%

rename from modules/ocl/src/kernels/arithm_bitwise_xor_scalar_mask.cl

rename to modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl

index 4efa2da..57ad9ee 100644 (file)
--- a/modules/ocl/src/kernels/arithm_bitwise_xor_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl
@@ -1055,4 +1055,3 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D6 (__global short *src1, int sr
      }
  }
  #endif
-
diff --git a/modules/ocl/src/kernels/arithm_cartToPolar.cl b/modules/ocl/src/opencl/arithm_cartToPolar.cl

similarity index 100%

rename from modules/ocl/src/kernels/arithm_cartToPolar.cl

rename to modules/ocl/src/opencl/arithm_cartToPolar.cl
diff --git a/modules/ocl/src/kernels/arithm_compare_eq.cl b/modules/ocl/src/opencl/arithm_compare_eq.cl

similarity index 74%

rename from modules/ocl/src/kernels/arithm_compare_eq.cl

rename to modules/ocl/src/opencl/arithm_compare_eq.cl

index 1db0b7d..f818532 100644 (file)
--- a/modules/ocl/src/kernels/arithm_compare_eq.cl
+++ b/modules/ocl/src/opencl/arithm_compare_eq.cl
@@ -63,31 +63,31 @@ __kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src
          x = x << 2;
  
          #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); 
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          uchar4 src1_data = vload4(0, src1 + src1_index_fix);
          uchar4 src2_data = vload4(0, src2 + src2_index_fix);
-               if(src1_index < 0)
-               {
-                       uchar4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       uchar4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
- 
-
-  
+        if(src1_index < 0)
+        {
+            uchar4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            uchar4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
+
+
          uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
  
@@ -115,29 +115,29 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr
          x = x << 2;
  
          #define dst_align ((dst_offset >> 1)& 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
  
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));                
-               if(src1_index < 0)
-               {
-                       ushort4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       ushort4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
+        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+        if(src1_index < 0)
+        {
+            ushort4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            ushort4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
  
          uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
@@ -166,32 +166,32 @@ __kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src
          x = x << 2;
  
          #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));          
-               if(src1_index < 0)
-               {
-                       short4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       short4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
- 
-
-
-  
+        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+        if(src1_index < 0)
+        {
+            short4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            short4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
+
+
+
          uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
  
@@ -215,32 +215,32 @@ __kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_
      int y = get_global_id(1);
  
      if (x < cols && y < rows)
-    {   
+    {
          x = x << 2;
          #define dst_align ((dst_offset >> 2) & 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
  
           int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
          int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
-               if(src1_index < 0)
-               {
-                       int4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       int4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
+        if(src1_index < 0)
+        {
+            int4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            int4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
  
          uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
@@ -266,22 +266,22 @@ __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src
      {
          x = x << 2;
          #define dst_align ((dst_offset >> 2) & 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
-        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));              if(src2_index < 0)
-               {
-                       float4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
- 
+        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));       if(src2_index < 0)
+        {
+            float4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
  
          uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
@@ -308,29 +308,29 @@ __kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int sr
      {
          x = x << 2;
          #define dst_align ((dst_offset >> 3) & 3)
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); 
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); 
+        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
+        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
          double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
-               if(src1_index < 0)
-               {
-                       double4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       double4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
- 
+        if(src1_index < 0)
+        {
+            double4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            double4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
  
          uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
@@ -359,31 +359,31 @@ __kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src
          x = x << 2;
  
          #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); 
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          uchar4 src1_data = vload4(0, src1 + src1_index_fix);
          uchar4 src2_data = vload4(0, src2 + src2_index_fix);
-               if(src1_index < 0)
-               {
-                       uchar4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       uchar4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
- 
-
- 
+        if(src1_index < 0)
+        {
+            uchar4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            uchar4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
+
+
          uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
  
@@ -410,31 +410,31 @@ __kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int sr
          x = x << 2;
  
          #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));                
-               if(src1_index < 0)
-               {
-                       ushort4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       ushort4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
-  
-
- 
+        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+        if(src1_index < 0)
+        {
+            ushort4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            ushort4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
+
+
          uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
  
@@ -463,29 +463,29 @@ __kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src
          x = x << 2;
  
          #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));          
-               if(src1_index < 0)
-               {
-                       short4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       short4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
- 
+        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+        if(src1_index < 0)
+        {
+            short4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            short4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
  
  
          uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
@@ -512,31 +512,31 @@ __kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_
      {
          x = x << 2;
          #define dst_align ((dst_offset >> 2) & 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
  
           int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
          int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
-               if(src1_index < 0)
-               {
-                       int4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       int4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
-
- 
+        if(src1_index < 0)
+        {
+            int4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            int4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
+
          uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
  
@@ -561,29 +561,29 @@ __kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src
      {
          x = x << 2;
          #define dst_align ((dst_offset >> 2) & 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
          float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
-               if(src1_index < 0)
-               {
-                       float4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       float4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
- 
+        if(src1_index < 0)
+        {
+            float4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            float4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
  
          uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
@@ -610,29 +610,29 @@ __kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int sr
      {
          x = x << 2;
          #define dst_align ((dst_offset >> 3) & 3)
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); 
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); 
+        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
+        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
-        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));    
-               if(src1_index < 0)
-               {
-                       double4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       double4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
- 
+        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
+        if(src1_index < 0)
+        {
+            double4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            double4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
  
          uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
@@ -661,30 +661,30 @@ __kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src
          x = x << 2;
  
          #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); 
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
  
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          uchar4 src1_data = vload4(0, src1 + src1_index_fix);
          uchar4 src2_data = vload4(0, src2 + src2_index_fix);
-               if(src1_index < 0)
-               {
-                       uchar4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       uchar4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
- 
+        if(src1_index < 0)
+        {
+            uchar4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            uchar4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
  
  
          uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
@@ -715,30 +715,30 @@ __kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int sr
          x = x << 2;
  
          #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
  
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));                
-               if(src1_index < 0)
-               {
-                       ushort4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       ushort4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
-  
+        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+        if(src1_index < 0)
+        {
+            ushort4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            ushort4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
  
  
  
@@ -770,30 +770,30 @@ __kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src
          x = x << 2;
  
          #define dst_align ((dst_offset >> 1)& 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
  
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));          
-               if(src1_index < 0)
-               {
-                       short4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       short4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
- 
+        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+        if(src1_index < 0)
+        {
+            short4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            short4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
  
  
          uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
@@ -821,30 +821,30 @@ __kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_
          x = x << 2;
  
          #define dst_align ((dst_offset >> 2)& 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
  
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
  
          int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
          int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
-               if(src1_index < 0)
-               {
-                       int4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       int4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
+        if(src1_index < 0)
+        {
+            int4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            int4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
         uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
  
@@ -870,30 +870,30 @@ __kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src
          x = x << 2;
  
          #define dst_align ((dst_offset >> 2)& 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
  
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
          float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
-               if(src1_index < 0)
-               {
-
-                       float4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       float4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
+        if(src1_index < 0)
+        {
+
+            float4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            float4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
  
          uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
@@ -921,28 +921,28 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr
          x = x << 2;
  
          #define dst_align ((dst_offset >> 3)& 3)
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); 
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); 
+        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
+        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
-        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));    
-               if(src1_index < 0)
-               {
-                       double4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       double4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }                       uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
+        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
+        if(src1_index < 0)
+        {
+            double4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            double4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }               uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
  
          dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
@@ -954,4 +954,3 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr
      }
  }
  #endif
-
diff --git a/modules/ocl/src/kernels/arithm_compare_ne.cl b/modules/ocl/src/opencl/arithm_compare_ne.cl

similarity index 73%

rename from modules/ocl/src/kernels/arithm_compare_ne.cl

rename to modules/ocl/src/opencl/arithm_compare_ne.cl

index 1c5063a..713dc13 100644 (file)
--- a/modules/ocl/src/kernels/arithm_compare_ne.cl
+++ b/modules/ocl/src/opencl/arithm_compare_ne.cl
@@ -59,29 +59,29 @@ __kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src
          x = x << 2;
  
          #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); 
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          uchar4 src1_data = vload4(0, src1 + src1_index_fix);
          uchar4 src2_data = vload4(0, src2 + src2_index_fix);
-               if(src1_index < 0)
-               {
-                       uchar4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       uchar4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
- 
+        if(src1_index < 0)
+        {
+            uchar4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            uchar4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
  
          uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
@@ -111,29 +111,29 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr
          x = x << 2;
  
          #define dst_align ((dst_offset >> 1)& 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
  
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));                
-               if(src1_index < 0)
-               {
-                       ushort4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       ushort4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
+        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+        if(src1_index < 0)
+        {
+            ushort4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            ushort4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
  
          uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
@@ -163,29 +163,29 @@ __kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src
          x = x << 2;
  
          #define dst_align ((dst_offset >> 1)& 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));          
-               if(src1_index < 0)
-               {
-                       short4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       short4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
- 
+        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+        if(src1_index < 0)
+        {
+            short4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            short4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
  
          uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
@@ -211,30 +211,30 @@ __kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_
      {
          x = x << 2;
          #define dst_align ((dst_offset >> 2)& 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-       
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
  
          int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
          int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
-               if(src1_index < 0)
-               {
-                       int4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       int4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
+        if(src1_index < 0)
+        {
+            int4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            int4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
  
          uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
@@ -260,28 +260,28 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src
      {
          x = x << 2;
          #define dst_align ((dst_offset >> 2) & 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
-        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));              if(src1_index < 0)
-               {
-                       float4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       float4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
- 
+        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));       if(src1_index < 0)
+        {
+            float4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            float4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
         uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
  
@@ -307,29 +307,29 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr
      {
          x = x << 2;
          #define dst_align ((dst_offset >> 3) & 3)
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); 
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); 
+        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
+        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
-        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));    
-               if(src1_index < 0)
-               {
-                       double4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       double4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
- 
+        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
+        if(src1_index < 0)
+        {
+            double4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            double4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
  
          uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
@@ -344,7 +344,7 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr
  }
  #endif
  
-   
+
  /***********************************Compare LT*******************************/
  __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src1_offset,
                               __global uchar *src2, int src2_step, int src2_offset,
@@ -359,29 +359,29 @@ __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src
          x = x << 2;
  
          #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); 
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          uchar4 src1_data = vload4(0, src1 + src1_index_fix);
          uchar4 src2_data = vload4(0, src2 + src2_index_fix);
-               if(src1_index < 0)
-               {
-                       uchar4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       uchar4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
- 
+        if(src1_index < 0)
+        {
+            uchar4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            uchar4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
  
          uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
@@ -411,30 +411,30 @@ __kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int sr
          x = x << 2;
  
          #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
  
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));                
-               if(src1_index < 0)
-               {
-                       ushort4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       ushort4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
-  
+        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+        if(src1_index < 0)
+        {
+            ushort4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            ushort4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
  
          uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
@@ -464,29 +464,29 @@ __kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src
          x = x << 2;
  
          #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));          
-               if(src1_index < 0)
-               {
-                       short4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       short4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
- 
+        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+        if(src1_index < 0)
+        {
+            short4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            short4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
  
  
          uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
@@ -513,34 +513,34 @@ __kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_
      {
          x = x << 2;
          #define dst_align ((dst_offset >> 2) & 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
  
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
  
          int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
          int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
-               if(src1_index < 0)
-               {
-                       int4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       int4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
-
-
- 
-   
+        if(src1_index < 0)
+        {
+            int4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            int4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
+
+
+
          uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
  
@@ -565,29 +565,29 @@ __kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src
      {
          x = x << 2;
          #define dst_align ((dst_offset >> 2) & 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
          float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
-               if(src1_index < 0)
-               {
-                       float4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       float4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
- 
+        if(src1_index < 0)
+        {
+            float4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            float4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
  
         uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
@@ -614,29 +614,29 @@ __kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int sr
      {
          x = x << 2;
          #define dst_align ((dst_offset >> 3) & 3)
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); 
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); 
+        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
+        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
-        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));    
-               if(src1_index < 0)
-               {
-                       double4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       double4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
- 
+        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
+        if(src1_index < 0)
+        {
+            double4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            double4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
  
         uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
@@ -665,29 +665,29 @@ __kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src
          x = x << 2;
  
          #define dst_align (dst_offset & 3)
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); 
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); 
+        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          uchar4 src1_data = vload4(0, src1 + src1_index_fix);
          uchar4 src2_data = vload4(0, src2 + src2_index_fix);
-               if(src1_index < 0)
-               {
-                       uchar4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       uchar4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
- 
+        if(src1_index < 0)
+        {
+            uchar4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            uchar4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
  
  
          uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
@@ -718,29 +718,29 @@ __kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int sr
          x = x << 2;
  
          #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
-        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));                
-               if(src1_index < 0)
-               {
-                       ushort4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       ushort4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
-  
+        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+        if(src1_index < 0)
+        {
+            ushort4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            ushort4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
  
  
          uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
@@ -771,29 +771,29 @@ __kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src
          x = x << 2;
  
          #define dst_align ((dst_offset >> 1) & 3)
-        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); 
-        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); 
+        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
-        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));          
-               if(src1_index < 0)
-               {
-                       short4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       short4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
- 
+        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+        if(src1_index < 0)
+        {
+            short4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            short4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
  
  
          uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
@@ -820,29 +820,29 @@ __kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_
      {
          x = x << 2;
          #define dst_align ((dst_offset >> 2)& 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
  
          int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
          int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
-               if(src1_index < 0)
-               {
-                       int4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       int4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
+        if(src1_index < 0)
+        {
+            int4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            int4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
  
          uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
          uchar4 tmp_data =convert_uchar4((src1_data <= src2_data));
@@ -868,28 +868,28 @@ __kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src
      {
          x = x << 2;
          #define dst_align ((dst_offset >> 2)& 3)
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
-        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));              
-               if(src1_index < 0)
-               {
-                       float4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       float4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
+        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
+        if(src1_index < 0)
+        {
+            float4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            float4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
  
          uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
@@ -916,29 +916,29 @@ __kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int sr
      {
          x = x << 2;
          #define dst_align ((dst_offset >> 3)& 3)
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); 
-        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); 
+        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
+        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
  
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-               int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-               int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
          double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
-        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));    
-               if(src1_index < 0)
-               {
-                       double4 tmp;
-                       tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-                       src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-               }
-               if(src2_index < 0)
-               {
-                       double4 tmp;
-                       tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-                       src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-               }               
- 
+        double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
+        if(src1_index < 0)
+        {
+            double4 tmp;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+        }
+        if(src2_index < 0)
+        {
+            double4 tmp;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+        }
+
  
          uchar4 dst_data  = *((__global uchar4 *)(dst  + dst_index));
          uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
@@ -952,5 +952,3 @@ __kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int sr
      }
  }
  #endif
-
-
diff --git a/modules/ocl/src/kernels/arithm_div.cl b/modules/ocl/src/opencl/arithm_div.cl

similarity index 99%

rename from modules/ocl/src/kernels/arithm_div.cl

rename to modules/ocl/src/opencl/arithm_div.cl

index 54fe3cd..dcbe303 100644 (file)
--- a/modules/ocl/src/kernels/arithm_div.cl
+++ b/modules/ocl/src/opencl/arithm_div.cl
@@ -455,5 +455,3 @@ __kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offse
      }
  }
  #endif
-
-
diff --git a/modules/ocl/src/kernels/arithm_exp.cl b/modules/ocl/src/opencl/arithm_exp.cl

similarity index 100%

rename from modules/ocl/src/kernels/arithm_exp.cl

rename to modules/ocl/src/opencl/arithm_exp.cl
diff --git a/modules/ocl/src/kernels/arithm_flip.cl b/modules/ocl/src/opencl/arithm_flip.cl

similarity index 100%

rename from modules/ocl/src/kernels/arithm_flip.cl

rename to modules/ocl/src/opencl/arithm_flip.cl
diff --git a/modules/ocl/src/kernels/arithm_flip_rc.cl b/modules/ocl/src/opencl/arithm_flip_rc.cl

similarity index 100%

rename from modules/ocl/src/kernels/arithm_flip_rc.cl

rename to modules/ocl/src/opencl/arithm_flip_rc.cl
diff --git a/modules/ocl/src/kernels/arithm_log.cl b/modules/ocl/src/opencl/arithm_log.cl

similarity index 100%

rename from modules/ocl/src/kernels/arithm_log.cl

rename to modules/ocl/src/opencl/arithm_log.cl
diff --git a/modules/ocl/src/kernels/arithm_magnitude.cl b/modules/ocl/src/opencl/arithm_magnitude.cl

similarity index 100%

rename from modules/ocl/src/kernels/arithm_magnitude.cl

rename to modules/ocl/src/opencl/arithm_magnitude.cl
diff --git a/modules/ocl/src/kernels/arithm_magnitudeSqr.cl b/modules/ocl/src/opencl/arithm_magnitudeSqr.cl

similarity index 98%

rename from modules/ocl/src/kernels/arithm_magnitudeSqr.cl

rename to modules/ocl/src/opencl/arithm_magnitudeSqr.cl

index f1d0aa5..3fd697f 100644 (file)
--- a/modules/ocl/src/kernels/arithm_magnitudeSqr.cl
+++ b/modules/ocl/src/opencl/arithm_magnitudeSqr.cl
@@ -60,17 +60,17 @@ __kernel void magnitudeSqr_C1_D5 (__global float *src1,int src1_step,int src1_of
      int y = get_global_id(1);
  
      if (x < cols && y < rows)
-    
+
  
      {
-            
+
          x = x << 2;
  
          #define dst_align ((dst_offset >> 2) & 3)
  
-        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); 
-        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); 
-       
+        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
@@ -125,16 +125,16 @@ __kernel void magnitudeSqr_C2_D5 (__global float *src1,int src1_step,int src1_of
      int y = get_global_id(1);
  
      if (x < cols && y < rows)
-    
+
  
      {
-            
+
          x = x << 2;
  
          #define dst_align ((dst_offset >> 2) & 3)
  
-        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); 
-       
+        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
+
          int dst_start  = mad24(y, dst_step, dst_offset);
          int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
          int dst_index  = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
@@ -148,8 +148,8 @@ __kernel void magnitudeSqr_C2_D5 (__global float *src1,int src1_step,int src1_of
            src1_data.s01234567 = src1_data.s45670123;
      if(src1_index== -2)
            src1_data.s01234567 = src1_data.s23456701;
-        
-    
+
+
  
          float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
  
diff --git a/modules/ocl/src/kernels/arithm_minMax.cl b/modules/ocl/src/opencl/arithm_minMax.cl

similarity index 100%

rename from modules/ocl/src/kernels/arithm_minMax.cl

rename to modules/ocl/src/opencl/arithm_minMax.cl
diff --git a/modules/ocl/src/kernels/arithm_minMaxLoc.cl b/modules/ocl/src/opencl/arithm_minMaxLoc.cl

similarity index 100%

rename from modules/ocl/src/kernels/arithm_minMaxLoc.cl

rename to modules/ocl/src/opencl/arithm_minMaxLoc.cl
diff --git a/modules/ocl/src/kernels/arithm_minMaxLoc_mask.cl b/modules/ocl/src/opencl/arithm_minMaxLoc_mask.cl

similarity index 99%

rename from modules/ocl/src/kernels/arithm_minMaxLoc_mask.cl

rename to modules/ocl/src/opencl/arithm_minMaxLoc_mask.cl

index f87b928..0af4f7b 100644 (file)
--- a/modules/ocl/src/kernels/arithm_minMaxLoc_mask.cl
+++ b/modules/ocl/src/opencl/arithm_minMaxLoc_mask.cl
@@ -240,4 +240,3 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int
         dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(lm_maxloc[0]);
     }
  }
-
diff --git a/modules/ocl/src/kernels/arithm_minMax_mask.cl b/modules/ocl/src/opencl/arithm_minMax_mask.cl

similarity index 99%

rename from modules/ocl/src/kernels/arithm_minMax_mask.cl

rename to modules/ocl/src/opencl/arithm_minMax_mask.cl

index 4097762..734ccab 100644 (file)
--- a/modules/ocl/src/kernels/arithm_minMax_mask.cl
+++ b/modules/ocl/src/opencl/arithm_minMax_mask.cl
@@ -194,4 +194,3 @@ __kernel void arithm_op_minMax_mask (int cols,int invalid_cols,int offset,int el
         dst[gid + groupnum] = localmem_max[0];
     }
  }
-
diff --git a/modules/ocl/src/kernels/arithm_mul.cl b/modules/ocl/src/opencl/arithm_mul.cl

similarity index 100%

rename from modules/ocl/src/kernels/arithm_mul.cl

rename to modules/ocl/src/opencl/arithm_mul.cl
diff --git a/modules/ocl/src/kernels/arithm_nonzero.cl b/modules/ocl/src/opencl/arithm_nonzero.cl

similarity index 100%

rename from modules/ocl/src/kernels/arithm_nonzero.cl

rename to modules/ocl/src/opencl/arithm_nonzero.cl
diff --git a/modules/ocl/src/kernels/arithm_phase.cl b/modules/ocl/src/opencl/arithm_phase.cl

similarity index 100%

rename from modules/ocl/src/kernels/arithm_phase.cl

rename to modules/ocl/src/opencl/arithm_phase.cl
diff --git a/modules/ocl/src/kernels/arithm_polarToCart.cl b/modules/ocl/src/opencl/arithm_polarToCart.cl

similarity index 100%

rename from modules/ocl/src/kernels/arithm_polarToCart.cl

rename to modules/ocl/src/opencl/arithm_polarToCart.cl
diff --git a/modules/ocl/src/kernels/arithm_pow.cl b/modules/ocl/src/opencl/arithm_pow.cl

similarity index 100%

rename from modules/ocl/src/kernels/arithm_pow.cl

rename to modules/ocl/src/opencl/arithm_pow.cl
diff --git a/modules/ocl/src/kernels/arithm_sub.cl b/modules/ocl/src/opencl/arithm_sub.cl

similarity index 100%

rename from modules/ocl/src/kernels/arithm_sub.cl

rename to modules/ocl/src/opencl/arithm_sub.cl
diff --git a/modules/ocl/src/kernels/arithm_sub_scalar.cl b/modules/ocl/src/opencl/arithm_sub_scalar.cl

similarity index 100%

rename from modules/ocl/src/kernels/arithm_sub_scalar.cl

rename to modules/ocl/src/opencl/arithm_sub_scalar.cl
diff --git a/modules/ocl/src/kernels/arithm_sub_scalar_mask.cl b/modules/ocl/src/opencl/arithm_sub_scalar_mask.cl

similarity index 100%

rename from modules/ocl/src/kernels/arithm_sub_scalar_mask.cl

rename to modules/ocl/src/opencl/arithm_sub_scalar_mask.cl
diff --git a/modules/ocl/src/kernels/arithm_sum.cl b/modules/ocl/src/opencl/arithm_sum.cl

similarity index 99%

rename from modules/ocl/src/kernels/arithm_sum.cl

rename to modules/ocl/src/opencl/arithm_sum.cl

index d29a71c..280b0a5 100644 (file)
--- a/modules/ocl/src/kernels/arithm_sum.cl
+++ b/modules/ocl/src/opencl/arithm_sum.cl
@@ -203,4 +203,3 @@ __kernel void arithm_op_sum (int cols,int invalid_cols,int offset,int elemnum,in
         dst[gid] = localmem_sum[0];
     }
  }
-
diff --git a/modules/ocl/src/kernels/arithm_sum_3.cl b/modules/ocl/src/opencl/arithm_sum_3.cl

similarity index 99%

rename from modules/ocl/src/kernels/arithm_sum_3.cl

rename to modules/ocl/src/opencl/arithm_sum_3.cl

index 1401889..3f6ed08 100644 (file)
--- a/modules/ocl/src/kernels/arithm_sum_3.cl
+++ b/modules/ocl/src/opencl/arithm_sum_3.cl
@@ -245,4 +245,3 @@ __kernel void arithm_op_sum_3 (int cols,int invalid_cols,int offset,int elemnum,
         dst[gid*3+2] = localmem_sum3[0];
     }
  }
-
diff --git a/modules/ocl/src/kernels/arithm_transpose.cl b/modules/ocl/src/opencl/arithm_transpose.cl

similarity index 100%

rename from modules/ocl/src/kernels/arithm_transpose.cl

rename to modules/ocl/src/opencl/arithm_transpose.cl
diff --git a/modules/ocl/src/kernels/blend_linear.cl b/modules/ocl/src/opencl/blend_linear.cl

similarity index 98%

rename from modules/ocl/src/kernels/blend_linear.cl

rename to modules/ocl/src/opencl/blend_linear.cl

index 06bde2f..50c5c39 100644 (file)
--- a/modules/ocl/src/kernels/blend_linear.cl
+++ b/modules/ocl/src/opencl/blend_linear.cl
@@ -15,7 +15,7 @@
  // Third party copyrights are property of their respective owners.
  //
  // @Authors
-//    Liu Liujun, liujun@multicorewareinc.com 
+//    Liu Liujun, liujun@multicorewareinc.com
  //
  // Redistribution and use in source and binary forms, with or without modification,
  // are permitted provided that the following conditions are met:
@@ -61,7 +61,7 @@ __kernel void BlendLinear_C1_D0(
          int pos = mad24(idy,istep >> 2,idx);
          int wpos = mad24(idy,wstep >> 2,idx);
          float4 w1 = weight1[wpos], w2 = weight2[wpos];
-        dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 + 
+        dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 +
              convert_float4(img2[pos]) * w2) / (w1 + w2 + 1e-5f));
      }
  }
@@ -86,7 +86,7 @@ __kernel void BlendLinear_C4_D0(
          int wpos = mad24(idy,wstep, idx);
          float w1 = weight1[wpos];
          float w2 = weight2[wpos];
-        dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 + 
+        dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 +
              convert_float4(img2[pos]) * w2) / (w1 + w2 + 1e-5f));
      }
  }
@@ -138,4 +138,3 @@ __kernel void BlendLinear_C4_D5(
          dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
      }
  }
-
diff --git a/modules/ocl/src/opencl/brute_force_match.cl b/modules/ocl/src/opencl/brute_force_match.cl

new file mode 100644 (file)

index 0000000..0730ac5
--- /dev/null
+++ b/modules/ocl/src/opencl/brute_force_match.cl
@@ -0,0 +1,865 @@
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
+#define MAX_FLOAT 1e7f
+
+/* Returns the number of set bits in x after truncating it to a 32-bit integer.
+   The matcher kernels in this file pass the XOR of two descriptor words, so the
+   result is the per-word Hamming distance. */
+int bit1Count(float x)
+{
+    int c = 0;
+    int ix = (int)x;
+
+    // Test each of the 32 bit positions in turn, lowest bit first.
+    for (int i = 0 ; i < 32 ; i++)
+    {
+        c += ix & 0x1;
+        ix >>= 1;
+    }
+
+    // The declared return type is int, so return the count directly rather
+    // than round-tripping it through float.
+    return c;
+}
+/* Finds, for every query descriptor, the train descriptor with the smallest distance.
+   Each query row handled by the work-group is cached in local memory up front
+   (max_desc_len floats per row); train data is streamed through a
+   block_size x block_size tile.
+
+   2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size
+   local size: dim0 is block_size, dim1 is block_size.
+
+   distType selects the metric: 0 - L1 (sum of absolute differences),
+   1 - sum of squared differences, 2 - Hamming (bit count of the XOR).
+   step is a row pitch in bytes; it is divided by sizeof(float) before indexing
+   and is applied to both query and train.
+   Results go to bestTrainIdx[queryIdx] / bestDistance[queryIdx]; when no candidate
+   is better than MAX_FLOAT the stored index is -1.
+   NOTE(review): sharebuffer presumably holds at least
+   block_size * max_desc_len + block_size * block_size floats, and max_desc_len is
+   presumably a multiple of block_size -- confirm against the host code.
+*/
+__kernel void BruteForceMatch_UnrollMatch(
+    __global float *query,
+    __global float *train,
+    //__global float *mask,
+    __global int *bestTrainIdx,
+    __global float *bestDistance,
+    __local float *sharebuffer,
+    int block_size,
+    int max_desc_len,
+    int query_rows,
+    int query_cols,
+    int train_rows,
+    int train_cols,
+    int step,
+    int distType
+)
+{
+    const int lidx = get_local_id(0);
+    const int lidy = get_local_id(1);
+    const int groupidx = get_group_id(0);
+
+    // Local memory layout: cached query rows first, then one train tile.
+    __local float *s_query = sharebuffer;
+    __local float *s_train = sharebuffer + block_size * max_desc_len;
+
+    int queryIdx = groupidx * block_size + lidy;
+
+    // load the query into local memory; rows past the end are clamped to the last
+    // valid row and columns past query_cols are zero-filled.
+    for (int i = 0 ;  i <  max_desc_len / block_size; i ++)
+    {
+        int loadx = lidx + i * block_size;
+        s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
+    }
+
+    float myBestDistance = MAX_FLOAT;
+    int myBestTrainIdx = -1;
+
+    // loopUnrolledCached to find the best trainIdx and best distance.
+    for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
+    {
+        float result = 0;
+
+        for (int i = 0 ; i < max_desc_len / block_size ; i++)
+        {
+            //load a block_size * block_size block into local train (stored transposed).
+            const int loadx = lidx + i * block_size;
+            s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+
+            //synchronize to make sure each elem for reduceIteration in share memory is written already.
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            /* there are three types in the reducer. the first is L1Dist, which sums abs(v1 - v2), the second is L2Dist, which
+            sums (v1 - v2) * (v1 - v2), the third is Hamming, which sums popc(v1 ^ v2); popc counts the bits that are set to 1 */
+
+            switch (distType)
+            {
+                case 0:
+
+                    for (int j = 0 ; j < block_size ; j++)
+                    {
+                        result += fabs(s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx]);
+                    }
+
+                    break;
+                case 1:
+
+                    for (int j = 0 ; j < block_size ; j++)
+                    {
+                        float qr = s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx];
+                        result += qr * qr;
+                    }
+
+                    break;
+                case 2:
+
+                    for (int j = 0 ; j < block_size ; j++)
+                    {
+                        //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
+                        result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
+                    }
+
+                    break;
+            }
+
+            // Do not overwrite the tile until every work-item has finished reading it.
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+
+        int trainIdx = t * block_size + lidx;
+
+        if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
+        {
+            myBestDistance = result;
+            myBestTrainIdx = trainIdx;
+        }
+    }
+
+    // The local buffer is reused below for the reduction, so wait for all readers first.
+    barrier(CLK_LOCAL_MEM_FENCE);
+    __local float *s_distance = (__local float *)(sharebuffer);
+    __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
+
+    //find BestMatch: each query row (lidy) owns block_size consecutive slots.
+    s_distance += lidy * block_size;
+    s_trainIdx += lidy * block_size;
+    s_distance[lidx] = myBestDistance;
+    s_trainIdx[lidx] = myBestTrainIdx;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    //reduce -- every work-item scans its whole row of candidates.
+    for (int k = 0 ; k < block_size; k++)
+    {
+        if (myBestDistance > s_distance[k])
+        {
+            myBestDistance = s_distance[k];
+            myBestTrainIdx = s_trainIdx[k];
+        }
+    }
+
+    // Only one work-item per query row writes the result.
+    if (queryIdx < query_rows && lidx == 0)
+    {
+        bestTrainIdx[queryIdx] = myBestTrainIdx;
+        bestDistance[queryIdx] = myBestDistance;
+    }
+}
+
+/* Finds, for every query descriptor, the train descriptor with the smallest distance.
+   Both query and train data are streamed through block_size x block_size local tiles,
+   so the descriptor length is bounded only by query_cols.
+
+   distType selects the metric: 0 - L1 (sum of absolute differences),
+   1 - sum of squared differences, 2 - Hamming (bit count of the XOR).
+   step is a row pitch in bytes; it is divided by sizeof(float) before indexing
+   and is applied to both query and train.
+   Results go to bestTrainIdx[queryIdx] / bestDistance[queryIdx]; when no candidate
+   is better than MAX_FLOAT the stored index is -1.
+   NOTE(review): sharebuffer presumably holds at least 2 * block_size * block_size
+   floats, and the work-group is presumably block_size x block_size -- confirm
+   against the host code.
+*/
+__kernel void BruteForceMatch_Match(
+    __global float *query,
+    __global float *train,
+    //__global float *mask,
+    __global int *bestTrainIdx,
+    __global float *bestDistance,
+    __local float *sharebuffer,
+    int block_size,
+    int query_rows,
+    int query_cols,
+    int train_rows,
+    int train_cols,
+    int step,
+    int distType
+)
+{
+    const int lidx = get_local_id(0);
+    const int lidy = get_local_id(1);
+    const int groupidx = get_group_id(0);
+
+    const int queryIdx = groupidx * block_size + lidy;
+
+    float myBestDistance = MAX_FLOAT;
+    int myBestTrainIdx = -1;
+
+    // Local memory layout: one query tile followed by one train tile.
+    __local float *s_query = sharebuffer;
+    __local float *s_train = sharebuffer + block_size * block_size;
+
+    // loop over the train rows, block_size rows at a time
+    for (int t = 0 ;  t < (train_rows + block_size - 1) / block_size ; t++)
+    {
+        //Dist dist;
+        float result = 0;
+
+        for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
+        {
+            const int loadx = lidx + i * block_size;
+            //load query and train into local memory; slots past query_cols stay zero
+            s_query[lidy * block_size + lidx] = 0;
+            s_train[lidx * block_size + lidy] = 0;
+
+            if (loadx < query_cols)
+            {
+                s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
+                s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
+            }
+
+            // Make both tiles visible to the whole work-group before reading them.
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            /* there are three types in the reducer. the first is L1Dist, which sums abs(v1 - v2), the second is L2Dist, which
+            sums (v1 - v2) * (v1 - v2), the third is Hamming, which sums popc(v1 ^ v2); popc counts the bits that are set to 1 */
+
+            switch (distType)
+            {
+                case 0:
+
+                    for (int j = 0 ; j < block_size ; j++)
+                    {
+                        result += fabs(s_query[lidy * block_size + j] -  s_train[j * block_size + lidx]);
+                    }
+
+                    break;
+                case 1:
+
+                    for (int j = 0 ; j < block_size ; j++)
+                    {
+                        float qr = s_query[lidy * block_size + j] -  s_train[j * block_size + lidx];
+                        result += qr * qr;
+                    }
+
+                    break;
+                case 2:
+
+                    for (int j = 0 ; j < block_size ; j++)
+                    {
+                        //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
+                        result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
+                    }
+
+                    break;
+            }
+
+            // Do not overwrite the tiles until every work-item has finished reading them.
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+
+        const int trainIdx = t * block_size + lidx;
+
+        if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
+        {
+            //myBestImgidx = imgIdx;
+            myBestDistance = result;
+            myBestTrainIdx = trainIdx;
+        }
+    }
+
+    // The local buffer is reused below for the reduction, so wait for all readers first.
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    __local float *s_distance = (__local float *)sharebuffer;
+    __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
+
+    //findBestMatch: each query row (lidy) owns block_size consecutive slots
+    s_distance += lidy * block_size;
+    s_trainIdx += lidy * block_size;
+    s_distance[lidx] = myBestDistance;
+    s_trainIdx[lidx] = myBestTrainIdx;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    //reduce -- every work-item scans its whole row of candidates.
+    for (int k = 0 ; k < block_size; k++)
+    {
+        if (myBestDistance > s_distance[k])
+        {
+            myBestDistance = s_distance[k];
+            myBestTrainIdx = s_trainIdx[k];
+        }
+    }
+
+    // Only one work-item per query row writes the result.
+    if (queryIdx < query_rows && lidx == 0)
+    {
+        bestTrainIdx[queryIdx] = myBestTrainIdx;
+        bestDistance[queryIdx] = myBestDistance;
+    }
+}
+
+//radius_unrollmatch
+/* Records every (query, train) pair whose distance is below maxDistance.
+   Work-group index 1 selects the block of query rows, work-group index 0 the block
+   of train rows; each work-item evaluates one pair. The descriptor is walked in
+   max_desc_len / block_size tiles of block_size columns.
+
+   distType selects the metric: 0 - L1 (sum of absolute differences),
+   1 - sum of squared differences, 2 - Hamming (bit count of the XOR).
+   step is the input row pitch in bytes and ostep the output row pitch in bytes;
+   both are divided by the element size before indexing.
+   For a hit, nMatches[queryIdx] is atomically incremented and the pair is stored in
+   output row queryIdx at the returned slot, provided the slot is below
+   bestTrainIdx_cols. The counter is incremented even when the slot is out of range.
+   NOTE(review): sharebuffer presumably holds at least 2 * block_size * block_size
+   floats -- confirm against the host code.
+*/
+__kernel void BruteForceMatch_RadiusUnrollMatch(
+    __global float *query,
+    __global float *train,
+    float maxDistance,
+    //__global float *mask,
+    __global int *bestTrainIdx,
+    __global float *bestDistance,
+    __global int *nMatches,
+    __local float *sharebuffer,
+    int block_size,
+    int max_desc_len,
+    int query_rows,
+    int query_cols,
+    int train_rows,
+    int train_cols,
+    int bestTrainIdx_cols,
+    int step,
+    int ostep,
+    int distType
+)
+{
+    const int lidx = get_local_id(0);
+    const int lidy = get_local_id(1);
+    const int groupidx = get_group_id(0);
+    const int groupidy = get_group_id(1);
+
+    const int queryIdx = groupidy * block_size + lidy;
+    const int trainIdx = groupidx * block_size + lidx;
+
+    // Local memory layout: one query tile followed by one train tile.
+    __local float *s_query = sharebuffer;
+    __local float *s_train = sharebuffer + block_size * block_size;
+
+    float result = 0;
+
+    for (int i = 0 ; i < max_desc_len / block_size ; ++i)
+    {
+        //load a block_size * block_size block into local train.
+        const int loadx = lidx + i * block_size;
+
+        // NOTE(review): the train load below is guarded by query_cols, not train_cols;
+        // presumably both descriptor widths are equal -- confirm.
+        s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
+        s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
+
+        //synchronize to make sure each elem for reduceIteration in share memory is written already.
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        /* there are three types in the reducer. the first is L1Dist, which sums abs(v1 - v2), the second is L2Dist, which
+        sums (v1 - v2) * (v1 - v2), the third is Hamming, which sums popc(v1 ^ v2); popc counts the bits that are set to 1 */
+
+        switch (distType)
+        {
+            case 0:
+
+                for (int j = 0 ; j < block_size ; ++j)
+                {
+                    result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
+                }
+
+                break;
+            case 1:
+
+                for (int j = 0 ; j < block_size ; ++j)
+                {
+                    float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
+                    result += qr * qr;
+                }
+
+                break;
+            case 2:
+
+                for (int j = 0 ; j < block_size ; ++j)
+                {
+                    result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
+                }
+
+                break;
+        }
+
+        // Do not overwrite the tiles until every work-item has finished reading them.
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
+    {
+        // Reserve the next output slot for this query row.
+        unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
+
+        // Store the match only while the output row still has room.
+        if (ind < bestTrainIdx_cols)
+        {
+            //bestImgIdx = imgIdx;
+            bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
+            bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
+        }
+    }
+}
+
+//radius_match
+/* Records every (query, train) pair whose distance is below maxDistance.
+   Work-group index 1 selects the block of query rows, work-group index 0 the block
+   of train rows; each work-item evaluates one pair. The descriptor is walked in
+   ceil(query_cols / block_size) tiles of block_size columns.
+
+   distType selects the metric: 0 - L1 (sum of absolute differences),
+   1 - sum of squared differences, 2 - Hamming (bit count of the XOR).
+   step is the input row pitch in bytes and ostep the output row pitch in bytes;
+   both are divided by the element size before indexing.
+   For a hit, nMatches[queryIdx] is atomically incremented and the pair is stored in
+   output row queryIdx at the returned slot, provided the slot is below
+   bestTrainIdx_cols. The counter is incremented even when the slot is out of range.
+   NOTE(review): sharebuffer presumably holds at least 2 * block_size * block_size
+   floats -- confirm against the host code.
+*/
+__kernel void BruteForceMatch_RadiusMatch(
+    __global float *query,
+    __global float *train,
+    float maxDistance,
+    //__global float *mask,
+    __global int *bestTrainIdx,
+    __global float *bestDistance,
+    __global int *nMatches,
+    __local float *sharebuffer,
+    int block_size,
+    int query_rows,
+    int query_cols,
+    int train_rows,
+    int train_cols,
+    int bestTrainIdx_cols,
+    int step,
+    int ostep,
+    int distType
+)
+{
+    const int lidx = get_local_id(0);
+    const int lidy = get_local_id(1);
+    const int groupidx = get_group_id(0);
+    const int groupidy = get_group_id(1);
+
+    const int queryIdx = groupidy * block_size + lidy;
+    const int trainIdx = groupidx * block_size + lidx;
+
+    // Local memory layout: one query tile followed by one train tile.
+    __local float *s_query = sharebuffer;
+    __local float *s_train = sharebuffer + block_size * block_size;
+
+    float result = 0;
+
+    for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i)
+    {
+        //load a block_size * block_size block into local train.
+        const int loadx = lidx + i * block_size;
+
+        // NOTE(review): the train load below is guarded by query_cols, not train_cols;
+        // presumably both descriptor widths are equal -- confirm.
+        s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
+        s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
+
+        //synchronize to make sure each elem for reduceIteration in share memory is written already.
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        /* there are three types in the reducer. the first is L1Dist, which sums abs(v1 - v2), the second is L2Dist, which
+        sums (v1 - v2) * (v1 - v2), the third is Hamming, which sums popc(v1 ^ v2); popc counts the bits that are set to 1 */
+
+        switch (distType)
+        {
+            case 0:
+
+                for (int j = 0 ; j < block_size ; ++j)
+                {
+                    result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
+                }
+
+                break;
+            case 1:
+
+                for (int j = 0 ; j < block_size ; ++j)
+                {
+                    float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
+                    result += qr * qr;
+                }
+
+                break;
+            case 2:
+
+                for (int j = 0 ; j < block_size ; ++j)
+                {
+                    result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
+                }
+
+                break;
+        }
+
+        // Do not overwrite the tiles until every work-item has finished reading them.
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
+    {
+        // Reserve the next output slot for this query row.
+        unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
+
+        // Store the match only while the output row still has room.
+        if (ind < bestTrainIdx_cols)
+        {
+            //bestImgIdx = imgIdx;
+            bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
+            bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
+        }
+    }
+}
+
+
+/* Finds, for every query descriptor, the two train descriptors with the smallest
+   distances (k = 2 nearest neighbours). Each query row handled by the work-group
+   is cached in local memory up front (max_desc_len floats per row); train data is
+   streamed through a block_size x block_size tile.
+
+   distType selects the metric: 0 - L1 (sum of absolute differences),
+   1 - sum of squared differences, 2 - Hamming (bit count of the XOR).
+   step is a row pitch in bytes; it is divided by sizeof(float) before indexing
+   and is applied to both query and train.
+   Results go to bestTrainIdx[queryIdx] / bestDistance[queryIdx] as (best, second best);
+   a slot with no candidate better than MAX_FLOAT keeps index -1.
+   NOTE(review): sharebuffer presumably holds at least
+   block_size * max_desc_len + block_size * block_size floats, and max_desc_len is
+   presumably a multiple of block_size -- confirm against the host code.
+*/
+__kernel void BruteForceMatch_knnUnrollMatch(
+    __global float *query,
+    __global float *train,
+    //__global float *mask,
+    __global int2 *bestTrainIdx,
+    __global float2 *bestDistance,
+    __local float *sharebuffer,
+    int block_size,
+    int max_desc_len,
+    int query_rows,
+    int query_cols,
+    int train_rows,
+    int train_cols,
+    int step,
+    int distType
+)
+{
+    const int lidx = get_local_id(0);
+    const int lidy = get_local_id(1);
+    const int groupidx = get_group_id(0);
+
+    const int queryIdx = groupidx * block_size + lidy;
+    // Local memory layout: cached query rows first, then one train tile.
+    local float *s_query = sharebuffer;
+    local float *s_train = sharebuffer + block_size * max_desc_len;
+
+    // load the query into local memory; rows past the end are clamped to the last
+    // valid row and columns past query_cols are zero-filled.
+    for (int i = 0 ;  i <  max_desc_len / block_size; i ++)
+    {
+        int loadx = lidx + i * block_size;
+        s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1)  * (step / sizeof(float)) + loadx] : 0;
+    }
+
+    float myBestDistance1 = MAX_FLOAT;
+    float myBestDistance2 = MAX_FLOAT;
+    int myBestTrainIdx1 = -1;
+    int myBestTrainIdx2 = -1;
+
+    //loopUnrolledCached
+    for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
+    {
+        float result = 0;
+
+        for (int i = 0 ; i < max_desc_len / block_size ; i++)
+        {
+            //load a block_size * block_size block into local train (stored transposed).
+            const int loadx = lidx + i * block_size;
+            s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+
+            //synchronize to make sure each elem for reduceIteration in share memory is written already.
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            /* there are three types in the reducer. the first is L1Dist, which sums abs(v1 - v2), the second is L2Dist, which
+            sums (v1 - v2) * (v1 - v2), the third is Hamming, which sums popc(v1 ^ v2); popc counts the bits that are set to 1 */
+
+            switch (distType)
+            {
+                case 0:
+
+                    for (int j = 0 ; j < block_size ; j++)
+                    {
+                        result += fabs(s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx]);
+                    }
+
+                    break;
+                case 1:
+
+                    for (int j = 0 ; j < block_size ; j++)
+                    {
+                        float qr = s_query[lidy * max_desc_len + i * block_size + j] -  s_train[j * block_size + lidx];
+                        result += qr * qr;
+                    }
+
+                    break;
+                case 2:
+
+                    for (int j = 0 ; j < block_size ; j++)
+                    {
+                        //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
+                        result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
+                    }
+
+                    break;
+            }
+
+            // Do not overwrite the tile until every work-item has finished reading it.
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+
+        const int trainIdx = t * block_size + lidx;
+
+        // Keep this work-item's two smallest distances, best first.
+        if (queryIdx < query_rows && trainIdx < train_rows)
+        {
+            if (result < myBestDistance1)
+            {
+                myBestDistance2 = myBestDistance1;
+                myBestTrainIdx2 = myBestTrainIdx1;
+                myBestDistance1 = result;
+                myBestTrainIdx1 = trainIdx;
+            }
+            else if (result < myBestDistance2)
+            {
+                myBestDistance2 = result;
+                myBestTrainIdx2 = trainIdx;
+            }
+        }
+    }
+
+    // The local buffer is reused below for the reduction, so wait for all readers first.
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    local float *s_distance = (local float *)sharebuffer;
+    local int *s_trainIdx = (local int *)(sharebuffer + block_size * block_size);
+
+    // find BestMatch: each query row (lidy) owns block_size consecutive slots.
+    s_distance += lidy * block_size;
+    s_trainIdx += lidy * block_size;
+
+    // Pass 1: publish every work-item's best candidate.
+    s_distance[lidx] = myBestDistance1;
+    s_trainIdx[lidx] = myBestTrainIdx1;
+
+    float bestDistance1 = MAX_FLOAT;
+    float bestDistance2 = MAX_FLOAT;
+    int bestTrainIdx1 = -1;
+    int bestTrainIdx2 = -1;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // The first work-item of each row picks the top two among the published bests.
+    if (lidx == 0)
+    {
+        for (int i = 0 ; i < block_size ; i++)
+        {
+            float val = s_distance[i];
+
+            if (val < bestDistance1)
+            {
+                bestDistance2 = bestDistance1;
+                bestTrainIdx2 = bestTrainIdx1;
+
+                bestDistance1 = val;
+                bestTrainIdx1 = s_trainIdx[i];
+            }
+            else if (val < bestDistance2)
+            {
+                bestDistance2 = val;
+                bestTrainIdx2 = s_trainIdx[i];
+            }
+        }
+    }
+
+    // Pass 1 readers must finish before the slots are overwritten.
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Pass 2: publish every work-item's second-best candidate.
+    s_distance[lidx] = myBestDistance2;
+    s_trainIdx[lidx] = myBestTrainIdx2;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // A second-best candidate of some work-item may still beat the current runner-up.
+    if (lidx == 0)
+    {
+        for (int i = 0 ; i < block_size ; i++)
+        {
+            float val = s_distance[i];
+
+            if (val < bestDistance2)
+            {
+                bestDistance2 = val;
+                bestTrainIdx2 = s_trainIdx[i];
+            }
+        }
+    }
+
+    myBestDistance1 = bestDistance1;
+    myBestDistance2 = bestDistance2;
+
+    myBestTrainIdx1 = bestTrainIdx1;
+    myBestTrainIdx2 = bestTrainIdx2;
+
+    // Only the work-item that performed the reduction writes the result.
+    if (queryIdx < query_rows && lidx == 0)
+    {
+        bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
+        bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
+    }
+}
+
+__kernel void BruteForceMatch_knnMatch(
+    __global float *query,
+    __global float *train,
+    //__global float *mask,
+    __global int2 *bestTrainIdx,
+    __global float2 *bestDistance,
+    __local float *sharebuffer,
+    int block_size,
+    int query_rows,
+    int query_cols,
+    int train_rows,
+    int train_cols,
+    int step,
+    int distType
+)
+{
+    const int lidx = get_local_id(0);
+    const int lidy = get_local_id(1);
+    const int groupidx = get_group_id(0);
+
+    const int queryIdx = groupidx * block_size + lidy;
+    local float *s_query = sharebuffer;
+    local float *s_train = sharebuffer + block_size * block_size;
+
+    float myBestDistance1 = MAX_FLOAT;
+    float myBestDistance2 = MAX_FLOAT;
+    int myBestTrainIdx1 = -1;
+    int myBestTrainIdx2 = -1;
+
+    // Loop over the train rows in tiles of block_size rows; for each tile, work-item (lidx, lidy) accumulates in result the distance between query row queryIdx and train row t * block_size + lidx, then updates its two best candidates.
+    for (int  t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
+    {
+        float result = 0.0f;
+
+        for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
+        {
+            const int loadx = lidx + i * block_size;
+            // Load one block_size x block_size tile of query and one of train into local memory (the train tile is stored transposed); entries with loadx >= query_cols stay 0, and row indices are clamped to the last valid row.
+            s_query[lidy * block_size + lidx] = 0;
+            s_train[lidx * block_size + lidy] = 0;
+
+            if (loadx < query_cols)
+            {
+                s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
+                s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
+            }
+
+            barrier(CLK_LOCAL_MEM_FENCE);
+
+            /* There are three reducer types, selected by distType: 0 is L1Dist, which sums fabs(v1 - v2); 1 is L2Dist, which
+            sums (v1 - v2) * (v1 - v2); 2 is Hamming, which sums popc(v1 ^ v2), where popc counts the bits that are set to 1. */
+
+            switch (distType)
+            {
+                case 0:
+
+                    for (int j = 0 ; j < block_size ; j++)
+                    {
+                        result += fabs(s_query[lidy * block_size + j] -  s_train[j * block_size + lidx]);
+                    }
+
+                    break;
+                case 1:
+
+                    for (int j = 0 ; j < block_size ; j++)
+                    {
+                        float qr = s_query[lidy * block_size + j] -  s_train[j * block_size + lidx];
+                        result += qr * qr;
+                    }
+
+                    break;
+                case 2:
+
+                    for (int j = 0 ; j < block_size ; j++)
+                    {
+                        //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
+                        result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
+                    }
+
+                    break;
+            }
+
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+
+        const int trainIdx = t * block_size + lidx;
+
+        if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
+        {
+            if (result < myBestDistance1)
+            {
+                myBestDistance2 = myBestDistance1;
+                myBestTrainIdx2 = myBestTrainIdx1;
+                myBestDistance1 = result;
+                myBestTrainIdx1 = trainIdx;
+            }
+            else if (result < myBestDistance2)
+            {
+                myBestDistance2 = result;
+                myBestTrainIdx2 = trainIdx;
+            }
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    __local float *s_distance = (__local float *)sharebuffer;
+    __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
+
+    // findBestMatch: reuse sharebuffer as scratch (block_size distances and block_size train indices per lidy row) so the work-item with lidx == 0 can reduce its row's candidates to the two best matches; first every work-item's best candidate is merged, then every work-item's second-best candidate.
+    s_distance += lidy * block_size;
+    s_trainIdx += lidy * block_size;
+
+    s_distance[lidx] = myBestDistance1;
+    s_trainIdx[lidx] = myBestTrainIdx1;
+
+    float bestDistance1 = MAX_FLOAT;
+    float bestDistance2 = MAX_FLOAT;
+    int bestTrainIdx1 = -1;
+    int bestTrainIdx2 = -1;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (lidx == 0)
+    {
+        for (int i = 0 ; i < block_size ; i++)
+        {
+            float val = s_distance[i];
+
+            if (val < bestDistance1)
+            {
+                bestDistance2 = bestDistance1;
+                bestTrainIdx2 = bestTrainIdx1;
+
+                bestDistance1 = val;
+                bestTrainIdx1 = s_trainIdx[i];
+            }
+            else if (val < bestDistance2)
+            {
+                bestDistance2 = val;
+                bestTrainIdx2 = s_trainIdx[i];
+            }
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    s_distance[lidx] = myBestDistance2;
+    s_trainIdx[lidx] = myBestTrainIdx2;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (lidx == 0)
+    {
+        for (int i = 0 ; i < block_size ; i++)
+        {
+            float val = s_distance[i];
+
+            if (val < bestDistance2)
+            {
+                bestDistance2 = val;
+                bestTrainIdx2 = s_trainIdx[i];
+            }
+        }
+    }
+
+    myBestDistance1 = bestDistance1;
+    myBestDistance2 = bestDistance2;
+
+    myBestTrainIdx1 = bestTrainIdx1;
+    myBestTrainIdx2 = bestTrainIdx2;
+
+    if (queryIdx < query_rows && lidx == 0)
+    {
+        bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
+        bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
+    }
+}
+
+kernel void BruteForceMatch_calcDistanceUnrolled(
+    __global float *query,
+    __global float *train,
+    //__global float *mask,
+    __global float *allDist,
+    __local float *sharebuffer,
+    int block_size,
+    int max_desc_len,
+    int query_rows,
+    int query_cols,
+    int train_rows,
+    int train_cols,
+    int step,
+    int distType)
+{
+    /* TODO: not implemented yet; this kernel is an empty placeholder and writes nothing to allDist. */
+}
+
+kernel void BruteForceMatch_calcDistance(
+    __global float *query,
+    __global float *train,
+    //__global float *mask,
+    __global float *allDist,
+    __local float *sharebuffer,
+    int block_size,
+    int query_rows,
+    int query_cols,
+    int train_rows,
+    int train_cols,
+    int step,
+    int distType)
+{
+    /* TODO: not implemented yet; this kernel is an empty placeholder and writes nothing to allDist. */
+}
+
+kernel void BruteForceMatch_findBestMatch(
+    __global float *allDist,
+    __global int *bestTrainIdx,
+    __global float *bestDistance,
+    int k,
+    int block_size
+)
+{
+    /* TODO: not implemented yet; this kernel is an empty placeholder and writes nothing to bestTrainIdx or bestDistance. */
+}
+\ No newline at end of file
diff --git a/modules/ocl/src/kernels/build_warps.cl b/modules/ocl/src/opencl/build_warps.cl

similarity index 99%

rename from modules/ocl/src/kernels/build_warps.cl

rename to modules/ocl/src/opencl/build_warps.cl

index 13d7bb9..07cccee 100644 (file)
--- a/modules/ocl/src/kernels/build_warps.cl
+++ b/modules/ocl/src/opencl/build_warps.cl
@@ -234,4 +234,3 @@ __kernel
          map_y[y * step_y + x] = ycoo;
      }
  }
-
diff --git a/modules/ocl/src/kernels/convertC3C4.cl b/modules/ocl/src/opencl/convertC3C4.cl

similarity index 100%

rename from modules/ocl/src/kernels/convertC3C4.cl

rename to modules/ocl/src/opencl/convertC3C4.cl
diff --git a/modules/ocl/src/kernels/cvt_color.cl b/modules/ocl/src/opencl/cvt_color.cl

similarity index 100%

rename from modules/ocl/src/kernels/cvt_color.cl

rename to modules/ocl/src/opencl/cvt_color.cl
diff --git a/modules/ocl/src/kernels/filter_sep_col.cl b/modules/ocl/src/opencl/filter_sep_col.cl

similarity index 100%

rename from modules/ocl/src/kernels/filter_sep_col.cl

rename to modules/ocl/src/opencl/filter_sep_col.cl
diff --git a/modules/ocl/src/kernels/filter_sep_row.cl b/modules/ocl/src/opencl/filter_sep_row.cl

similarity index 99%

rename from modules/ocl/src/kernels/filter_sep_row.cl

rename to modules/ocl/src/opencl/filter_sep_row.cl

index dbca8bd..bfe6cd4 100644 (file)
--- a/modules/ocl/src/kernels/filter_sep_row.cl
+++ b/modules/ocl/src/opencl/filter_sep_row.cl
@@ -466,5 +466,3 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_
          dst[start_addr] = sum;
      }
  }
-
-
diff --git a/modules/ocl/src/kernels/filtering_boxFilter.cl b/modules/ocl/src/opencl/filtering_boxFilter.cl

similarity index 100%

rename from modules/ocl/src/kernels/filtering_boxFilter.cl

rename to modules/ocl/src/opencl/filtering_boxFilter.cl
diff --git a/modules/ocl/src/kernels/filtering_laplacian.cl b/modules/ocl/src/opencl/filtering_laplacian.cl

similarity index 100%

rename from modules/ocl/src/kernels/filtering_laplacian.cl

rename to modules/ocl/src/opencl/filtering_laplacian.cl
diff --git a/modules/ocl/src/kernels/filtering_morph.cl b/modules/ocl/src/opencl/filtering_morph.cl

similarity index 100%

rename from modules/ocl/src/kernels/filtering_morph.cl

rename to modules/ocl/src/opencl/filtering_morph.cl
diff --git a/modules/ocl/src/kernels/haarobjectdetect.cl b/modules/ocl/src/opencl/haarobjectdetect.cl

similarity index 99%

rename from modules/ocl/src/kernels/haarobjectdetect.cl

rename to modules/ocl/src/opencl/haarobjectdetect.cl

index 7835b4b..2fa0906 100644 (file)
--- a/modules/ocl/src/kernels/haarobjectdetect.cl
+++ b/modules/ocl/src/opencl/haarobjectdetect.cl
@@ -559,7 +559,3 @@ if(result)
  }
  }
  */
-
-
-
-
diff --git a/modules/ocl/src/kernels/haarobjectdetect_scaled2.cl b/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl

similarity index 99%

rename from modules/ocl/src/kernels/haarobjectdetect_scaled2.cl

rename to modules/ocl/src/opencl/haarobjectdetect_scaled2.cl

index 22d3004..9912b9c 100644 (file)
--- a/modules/ocl/src/kernels/haarobjectdetect_scaled2.cl
+++ b/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl
@@ -283,4 +283,3 @@ __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuH
          newnode[counter].alpha[0] = t1.alpha[0];
          newnode[counter].alpha[1] = t1.alpha[1];
  }
-
diff --git a/modules/ocl/src/kernels/imgproc_bilateral.cl b/modules/ocl/src/opencl/imgproc_bilateral.cl

similarity index 100%

rename from modules/ocl/src/kernels/imgproc_bilateral.cl

rename to modules/ocl/src/opencl/imgproc_bilateral.cl
diff --git a/modules/ocl/src/kernels/imgproc_calcHarris.cl b/modules/ocl/src/opencl/imgproc_calcHarris.cl

similarity index 100%

rename from modules/ocl/src/kernels/imgproc_calcHarris.cl

rename to modules/ocl/src/opencl/imgproc_calcHarris.cl
diff --git a/modules/ocl/src/kernels/imgproc_calcMinEigenVal.cl b/modules/ocl/src/opencl/imgproc_calcMinEigenVal.cl

similarity index 100%

rename from modules/ocl/src/kernels/imgproc_calcMinEigenVal.cl

rename to modules/ocl/src/opencl/imgproc_calcMinEigenVal.cl
diff --git a/modules/ocl/src/kernels/imgproc_canny.cl b/modules/ocl/src/opencl/imgproc_canny.cl

similarity index 100%

rename from modules/ocl/src/kernels/imgproc_canny.cl

rename to modules/ocl/src/opencl/imgproc_canny.cl
diff --git a/modules/ocl/src/kernels/imgproc_columnsum.cl b/modules/ocl/src/opencl/imgproc_columnsum.cl

similarity index 100%

rename from modules/ocl/src/kernels/imgproc_columnsum.cl

rename to modules/ocl/src/opencl/imgproc_columnsum.cl
diff --git a/modules/ocl/src/kernels/imgproc_convolve.cl b/modules/ocl/src/opencl/imgproc_convolve.cl

similarity index 99%

rename from modules/ocl/src/kernels/imgproc_convolve.cl

rename to modules/ocl/src/opencl/imgproc_convolve.cl

index d113eb8..76e7cfc 100644 (file)
--- a/modules/ocl/src/kernels/imgproc_convolve.cl
+++ b/modules/ocl/src/opencl/imgproc_convolve.cl
@@ -107,5 +107,3 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global
          dst[gy*(dst_step >> 2)+gx] = res;
     }
  }
-
-
diff --git a/modules/ocl/src/kernels/imgproc_copymakeboder.cl b/modules/ocl/src/opencl/imgproc_copymakeboder.cl

similarity index 100%

rename from modules/ocl/src/kernels/imgproc_copymakeboder.cl

rename to modules/ocl/src/opencl/imgproc_copymakeboder.cl
diff --git a/modules/ocl/src/kernels/imgproc_histogram.cl b/modules/ocl/src/opencl/imgproc_histogram.cl

similarity index 99%

rename from modules/ocl/src/kernels/imgproc_histogram.cl

rename to modules/ocl/src/opencl/imgproc_histogram.cl

index 01e333f..6bfa095 100644 (file)
--- a/modules/ocl/src/kernels/imgproc_histogram.cl
+++ b/modules/ocl/src/opencl/imgproc_histogram.cl
@@ -267,4 +267,3 @@ __kernel __attribute__((reqd_work_group_size(256,1,1)))void equalizeHist(
      }
  }
  */
-
diff --git a/modules/ocl/src/kernels/imgproc_integral.cl b/modules/ocl/src/opencl/imgproc_integral.cl

similarity index 100%

rename from modules/ocl/src/kernels/imgproc_integral.cl

rename to modules/ocl/src/opencl/imgproc_integral.cl
diff --git a/modules/ocl/src/kernels/imgproc_integral_sum.cl b/modules/ocl/src/opencl/imgproc_integral_sum.cl

similarity index 100%

rename from modules/ocl/src/kernels/imgproc_integral_sum.cl

rename to modules/ocl/src/opencl/imgproc_integral_sum.cl
diff --git a/modules/ocl/src/kernels/imgproc_median.cl b/modules/ocl/src/opencl/imgproc_median.cl

similarity index 99%

rename from modules/ocl/src/kernels/imgproc_median.cl

rename to modules/ocl/src/opencl/imgproc_median.cl

index 2d9cd45..b87af96 100644 (file)
--- a/modules/ocl/src/kernels/imgproc_median.cl
+++ b/modules/ocl/src/opencl/imgproc_median.cl
@@ -484,4 +484,3 @@ __kernel void medianFilter5_C1_D5(__global float * src, __global float * dst,  i
          dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
  }
  #undef op(a,b)
-
diff --git a/modules/ocl/src/kernels/imgproc_remap.cl b/modules/ocl/src/opencl/imgproc_remap.cl

similarity index 98%

rename from modules/ocl/src/kernels/imgproc_remap.cl

rename to modules/ocl/src/opencl/imgproc_remap.cl

index 4917749..ee40e93 100644 (file)
--- a/modules/ocl/src/kernels/imgproc_remap.cl
+++ b/modules/ocl/src/opencl/imgproc_remap.cl
@@ -48,7 +48,7 @@
  #if defined DOUBLE_SUPPORT
  #pragma OPENCL EXTENSION cl_khr_fp64:enable
  typedef double4 F4 ;
-#else 
+#else
  typedef float4 F4;
  #endif
  
@@ -62,7 +62,7 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsig
  {
      int x = get_global_id(0);
      int y = get_global_id(1);
-     
+
      if(x < threadCols && y < dst_rows)
      {
          x = x << 2;
@@ -79,7 +79,7 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsig
  
          map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
          int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even) + src_offset;
-   
+
          uchar4 con = convert_uchar4(convert_int4(map1_data.even) >= (int4)(src_cols) || convert_int4(map1_data.odd) >= (int4)(src_rows) || convert_int4(map1_data.even) < (int4)(0) || convert_int4(map1_data.odd) < (int4)(0));
          uchar4 src_data = val;
  
@@ -91,12 +91,12 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsig
          src_data.s2 = *(src + srcIdx.s2);
          if (con.s3 == 0)
          src_data.s3 = *(src + srcIdx.s3);
-        
+
          uchar4 dst_data;
- 
+
          __global uchar4* d = (__global uchar4 *)(dst + dstStart);
  
-        uchar4 dVal = *d;      
+        uchar4 dVal = *d;
  
          int4 dcon = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
          dst_data = (convert_uchar4(dcon) != convert_uchar4((int4)(0))) ? src_data : dVal;
@@ -113,7 +113,7 @@ __kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsig
  {
      int x = get_global_id(0);
      int y = get_global_id(1);
-     
+
      if(x < threadCols && y < dst_rows)
      {
          x = x << 2;
@@ -131,9 +131,9 @@ __kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsig
          map1_data = *((__global float8 *)((__global char*)map1 + map1Start));
          int8 map1_dataZ = convert_int8_sat_rte(map1_data);
          int4 srcIdx = map1_dataZ.odd * src_step + map1_dataZ.even + src_offset;
-    
+
          uchar4 src_data = val;
-        uchar4 con = convert_uchar4(map1_dataZ.even >= (int4)(src_cols) || map1_dataZ.odd >= (int4)(src_rows) || map1_dataZ.even < (int4)(0) || map1_dataZ.odd < (int4)(0)); 
+        uchar4 con = convert_uchar4(map1_dataZ.even >= (int4)(src_cols) || map1_dataZ.odd >= (int4)(src_rows) || map1_dataZ.even < (int4)(0) || map1_dataZ.odd < (int4)(0));
  
          if (con.s0 == 0)
          src_data.s0 = *(src + srcIdx.s0);
@@ -147,10 +147,10 @@ __kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsig
         // dst_data = convert_uchar4(map1_dataZ.even >= (int4)(src_cols) || map1_dataZ.odd >= (int4)(src_rows)) ? (uchar4)(val) : src_data;
          __global uchar4* d = (__global uchar4 *)(dst + dstStart);
  
-        uchar4 dVal = *d;      
+        uchar4 dVal = *d;
  
          int4 dcon = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
-  
+
          dst_data = (convert_uchar4(dcon) != convert_uchar4((int4)(0))) ? src_data : dVal;
          *d = dst_data;
      }
@@ -162,7 +162,7 @@ __kernel void remapNNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
  {
      int x = get_global_id(0);
      int y = get_global_id(1);
-     
+
      if(x < threadCols && y < dst_rows)
      {
          x = x << 2;
@@ -183,9 +183,9 @@ __kernel void remapNNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
          float8 map_data = (float8)(map1_data.s0, map2_data.s0, map1_data.s1, map2_data.s1, map1_data.s2, map2_data.s2, map1_data.s3, map2_data.s3);
          int8 map_dataZ = convert_int8_sat_rte(map_data);
          int4 srcIdx = map_dataZ.odd * src_step + map_dataZ.even + src_offset;
-     
+
          uchar4 src_data = val;
-        uchar4 con = convert_uchar4(map_dataZ.even >= (int4)(src_cols) || map_dataZ.odd >= (int4)(src_rows)|| map_dataZ.even < (int4)(0) || map_dataZ.odd < (int4)(0)); 
+        uchar4 con = convert_uchar4(map_dataZ.even >= (int4)(src_cols) || map_dataZ.odd >= (int4)(src_rows)|| map_dataZ.even < (int4)(0) || map_dataZ.odd < (int4)(0));
  
          if (con.s0 == 0)
          src_data.s0 = *(src + srcIdx.s0);
@@ -196,14 +196,14 @@ __kernel void remapNNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
          if (con.s3 == 0)
          src_data.s3 = *(src + srcIdx.s3);
          uchar4 dst_data;
-    
+
      //    dst_data = convert_uchar4(map_dataZ.even >= (int4)(src_cols) || map_dataZ.odd >= (int4)(src_rows)) ? (uchar4)(val) : src_data;
          __global uchar4* d = (__global uchar4 *)(dst + dstStart);
  
-        uchar4 dVal = *d;      
+        uchar4 dVal = *d;
  
          int4 dcon = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
-  
+
          dst_data = (convert_uchar4(dcon) != convert_uchar4((int4)(0))) ? src_data : dVal;
          *d = dst_data;
      }
@@ -272,7 +272,7 @@ __kernel void remapNNF1Constant_C4_D0(__global unsigned char* dst, __global unsi
      int y = get_global_id(1);
  
      if(x < threadCols && y < dst_rows)
-    { 
+    {
           int dstIdx = y * dst_step + (x << 2) + dst_offset;
          int mapIdx = y * map1_step + (x << 2) + map1_offset;
          float map1_data = *((__global float *)((__global char*)map1 + mapIdx));
@@ -294,7 +294,7 @@ __kernel void remapNNSConstant_C1_D5(__global float* dst, __global float const *
  {
      int x = get_global_id(0);
      int y = get_global_id(1);
-     
+
      if(x < threadCols && y < dst_rows)
      {
          int dstIdx = y * dst_step + (x << 2) + dst_offset;
@@ -309,7 +309,7 @@ __kernel void remapNNSConstant_C1_D5(__global float* dst, __global float const *
             src_data = *((__global float *)((__global uchar *)src + srcIdx));
          *((__global float *)((__global uchar*)dst + dstIdx)) = src_data;
  
- 
+
      }
  
  
@@ -321,7 +321,7 @@ __kernel void remapNNFConstant_C1_D5(__global float* dst, __global float const *
  {
      int x = get_global_id(0);
      int y = get_global_id(1);
-     
+
      if(x < threadCols && y < dst_rows)
      {
          int dstIdx = y * dst_step + (x << 2) + dst_offset;
@@ -337,7 +337,7 @@ __kernel void remapNNFConstant_C1_D5(__global float* dst, __global float const *
             src_data = *((__global float *)((__global uchar *)src + srcIdx));
          *((__global float *)((__global uchar*)dst + dstIdx)) = src_data;
  
- 
+
      }
  
  }
@@ -348,7 +348,7 @@ __kernel void remapNNF1Constant_C1_D5(__global float* dst, __global float const
  {
      int x = get_global_id(0);
      int y = get_global_id(1);
-     
+
      if(x < threadCols && y < dst_rows)
      {
          int dstIdx = y * dst_step + (x << 2) + dst_offset;
@@ -367,7 +367,7 @@ __kernel void remapNNF1Constant_C1_D5(__global float* dst, __global float const
             src_data = *((__global float *)((__global uchar *)src + srcIdx));
          *((__global float *)((__global uchar*)dst + dstIdx)) = src_data;
  
- 
+
      }
  
  }
@@ -391,9 +391,9 @@ __kernel void remapNNSConstant_C4_D5(__global float * dst, __global float const
            src_data = nval;
        else
            src_data = *((__global float4 *)((__global uchar *)src + srcIdx));
-      *((__global float4 *)((__global uchar*)dst + dstIdx)) = src_data; 
+      *((__global float4 *)((__global uchar*)dst + dstIdx)) = src_data;
+
  
-      
      }
  }
  
@@ -454,13 +454,13 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig
      int y = get_global_id(1);
      if(x < threadCols && y < dst_rows)
      {
-      x = x << 2; 
+      x = x << 2;
        int gx = x - (dst_offset&3);
        int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);
  
        uchar4 nval =convert_uchar4(nVal);
        uchar4 val = (uchar4)(nval.s0);
-  
+
  
        int dstStart = (y * dst_step + x  + dst_offset) - (dst_offset&3);
  
@@ -518,12 +518,12 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig
            d.s2 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s2 * src_step + map1_dataDx1.s2 + src_offset));
        if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0)
            d.s3 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s3 * src_step + map1_dataDx1.s3 + src_offset));
- 
+
        uchar4 dst_data = convert_uchar4_sat_rte((convert_float4(a))* ud * vd +(convert_float4(b))* u * vd + (convert_float4(c))* ud * v + (convert_float4(d)) * u * v );
-    
+
        __global uchar4* D = (__global uchar4 *)(dst + dstStart);
  
-      uchar4 dVal = *D;      
+      uchar4 dVal = *D;
        int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
        dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;
  
@@ -540,13 +540,13 @@ __kernel void remapLNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
      int y = get_global_id(1);
      if(x < threadCols && y < dst_rows)
      {
-      x = x << 2; 
+      x = x << 2;
        int gx = x - (dst_offset&3);
        int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);
  
        uchar4 nval =convert_uchar4(nVal);
        uchar4 val = (uchar4)(nval.s0);
-  
+
  
        int dstStart = (y * dst_step + x  + dst_offset) - (dst_offset&3);
  
@@ -607,13 +607,13 @@ __kernel void remapLNF1Constant_C1_D0(__global unsigned char* dst, __global unsi
            d.s2 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s2 * src_step + map1_dataDx1.s2 + src_offset));
        if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0)
            d.s3 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s3 * src_step + map1_dataDx1.s3 + src_offset));
- 
+
  
        uchar4 dst_data = convert_uchar4_sat_rte((convert_float4(a))* ud * vd +(convert_float4(b))* u * vd + (convert_float4(c))* ud * v + (convert_float4(d)) * u * v );
-    
+
        __global uchar4* D = (__global uchar4 *)(dst + dstStart);
  
-      uchar4 dVal = *D;      
+      uchar4 dVal = *D;
        int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
        dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;
  
@@ -725,13 +725,13 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const *
      int y = get_global_id(1);
      if(x < threadCols && y < dst_rows)
      {
-      x = x << 4; 
+      x = x << 4;
        int gx = x - (dst_offset&15);
        int4 Gx = (int4)(gx, gx+4, gx+8, gx+12);
  
        float4 nval =convert_float4(nVal);
        float4 val = (float4)(nval.s0);
-  
+
        int dstStart = (y * dst_step + x  + dst_offset) - (dst_offset&15);
        int map1Start = y * map1_step + (x << 1) + map1_offset - ((dst_offset & 15) << 1);
        float8 map1_data;
@@ -787,12 +787,12 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const *
            d.s2 = *((__global float*)((__global uchar *)src + map1_dataDy1.s2 * src_step + (map1_dataDx1.s2 << 2) + src_offset));
        if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0)
            d.s3 = *((__global float*)((__global uchar *)src + map1_dataDy1.s3 * src_step + (map1_dataDx1.s3 << 2) + src_offset));
-    
+
        float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v ;
-    
+
        __global float4* D = (__global float4 *)((__global char*)dst + dstStart);
  
-      float4 dVal = *D;      
+      float4 dVal = *D;
        int4 con = (Gx >= 0 && Gx < (dst_cols << 2) && y >= 0 && y < dst_rows);
        dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
  
@@ -809,13 +809,13 @@ __kernel void remapLNF1Constant_C1_D5(__global float* dst, __global float const
      int y = get_global_id(1);
      if(x < threadCols && y < dst_rows)
      {
-      x = x << 4; 
+      x = x << 4;
        int gx = x - (dst_offset&15);
        int4 Gx = (int4)(gx, gx+4, gx+8, gx+12);
  
        float4 nval =convert_float4(nVal);
        float4 val = (float4)(nval.s0);
-  
+
        int dstStart = y * dst_step + x  + dst_offset - (dst_offset & 15);
        int map1Start = y * map1_step + x + map1_offset - (dst_offset & 15);
        float4 map1_data;
@@ -874,13 +874,13 @@ __kernel void remapLNF1Constant_C1_D5(__global float* dst, __global float const
            d.s2 = *((__global float*)((__global uchar *)src + map1_dataDy1.s2 * src_step + (map1_dataDx1.s2 << 2) + src_offset));
        if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0)
            d.s3 = *((__global float*)((__global uchar *)src + map1_dataDy1.s3 * src_step + (map1_dataDx1.s3 << 2) + src_offset));
- 
-      
+
+
        float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v ;
-    
+
        __global float4* D = (__global float4 *)((__global char*)dst + dstStart);
  
-      float4 dVal = *D;      
+      float4 dVal = *D;
        int4 con = (Gx >= 0 && Gx < (dst_cols << 2) && y >= 0 && y < dst_rows);
        dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
  
@@ -928,7 +928,7 @@ __kernel void remapLNFConstant_C4_D5(__global float * dst, __global float const
        else
        d = *((__global float4 *)((__global uchar *)src + map_dataD.y * src_step + (map_dataD.x<<4) + src_offset ));
  
-      float4 dst_data = a * ((float4)(1.0-u.x)) * ((float4)(1.0-u.y)) + b *((float4)(u.x)) * ((float4)(1.0-u.y)) + c * ((float4)(1.0-u.x)) *((float4)(u.y)) + d *((float4)(u.x)) *((float4)(u.y)); 
+      float4 dst_data = a * ((float4)(1.0-u.x)) * ((float4)(1.0-u.y)) + b *((float4)(u.x)) * ((float4)(1.0-u.y)) + c * ((float4)(1.0-u.x)) *((float4)(u.y)) + d *((float4)(u.x)) *((float4)(u.y));
        *((__global float4 *)((__global uchar*)dst + dstIdx)) =  dst_data ;
  
      }
@@ -974,12 +974,9 @@ __kernel void remapLNF1Constant_C4_D5(__global float * dst, __global float const
        else
        d = *((__global float4 *)((__global uchar *)src + map_dataD.y * src_step + (map_dataD.x<<4) + src_offset ));
  
-      float4 dst_data = a * ((float4)(1.0-u.x)) * ((float4)(1.0-u.y)) + b *((float4)(u.x)) * ((float4)(1.0-u.y)) + c * ((float4)(1.0-u.x)) *((float4)(u.y)) + d *((float4)(u.x)) *((float4)(u.y)); 
+      float4 dst_data = a * ((float4)(1.0-u.x)) * ((float4)(1.0-u.y)) + b *((float4)(u.x)) * ((float4)(1.0-u.y)) + c * ((float4)(1.0-u.x)) *((float4)(u.y)) + d *((float4)(u.x)) *((float4)(u.y));
        *((__global float4 *)((__global uchar*)dst + dstIdx)) =  dst_data ;
  
  
      }
  }
-
-
-
diff --git a/modules/ocl/src/kernels/imgproc_resize.cl b/modules/ocl/src/opencl/imgproc_resize.cl

similarity index 99%

rename from modules/ocl/src/kernels/imgproc_resize.cl

rename to modules/ocl/src/opencl/imgproc_resize.cl

index b6a25d3..fd486de 100644 (file)
--- a/modules/ocl/src/kernels/imgproc_resize.cl
+++ b/modules/ocl/src/opencl/imgproc_resize.cl
@@ -411,4 +411,3 @@ __kernel void resizeNN_C4_D5(__global float4 * dst, __global float4 * src,
          dst[dpos] = src[spos];
  
  }
-
diff --git a/modules/ocl/src/kernels/imgproc_threshold.cl b/modules/ocl/src/opencl/imgproc_threshold.cl

similarity index 99%

rename from modules/ocl/src/kernels/imgproc_threshold.cl

rename to modules/ocl/src/opencl/imgproc_threshold.cl

index e046b49..8ad501f 100644 (file)
--- a/modules/ocl/src/kernels/imgproc_threshold.cl
+++ b/modules/ocl/src/opencl/imgproc_threshold.cl
@@ -150,4 +150,3 @@ __kernel void threshold_C1_D5(__global const float * restrict src, __global floa
          }
      }
  }
-
diff --git a/modules/ocl/src/kernels/imgproc_warpAffine.cl b/modules/ocl/src/opencl/imgproc_warpAffine.cl

similarity index 100%

rename from modules/ocl/src/kernels/imgproc_warpAffine.cl

rename to modules/ocl/src/opencl/imgproc_warpAffine.cl
diff --git a/modules/ocl/src/kernels/imgproc_warpPerspective.cl b/modules/ocl/src/opencl/imgproc_warpPerspective.cl

similarity index 99%

rename from modules/ocl/src/kernels/imgproc_warpPerspective.cl

rename to modules/ocl/src/opencl/imgproc_warpPerspective.cl

index 9a5ec83..a37ffa1 100644 (file)
--- a/modules/ocl/src/kernels/imgproc_warpPerspective.cl
+++ b/modules/ocl/src/opencl/imgproc_warpPerspective.cl
@@ -682,4 +682,3 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4
          }
     }
  }
-
diff --git a/modules/ocl/src/kernels/interpolate_frames.cl b/modules/ocl/src/opencl/interpolate_frames.cl

similarity index 100%

rename from modules/ocl/src/kernels/interpolate_frames.cl

rename to modules/ocl/src/opencl/interpolate_frames.cl
diff --git a/modules/ocl/src/kernels/match_template.cl b/modules/ocl/src/opencl/match_template.cl

similarity index 99%

rename from modules/ocl/src/kernels/match_template.cl

rename to modules/ocl/src/opencl/match_template.cl

index ddbd86b..3133e62 100644 (file)
--- a/modules/ocl/src/kernels/match_template.cl
+++ b/modules/ocl/src/opencl/match_template.cl
@@ -821,4 +821,3 @@ void matchTemplate_Prepared_CCOFF_NORMED_C4_D0
          res[res_idx] = normAcc(num, denum);
      }
  }
-
diff --git a/modules/ocl/src/kernels/meanShift.cl b/modules/ocl/src/opencl/meanShift.cl

similarity index 99%

rename from modules/ocl/src/kernels/meanShift.cl

rename to modules/ocl/src/opencl/meanShift.cl

index 4b5a08b..a5b1108 100644 (file)
--- a/modules/ocl/src/kernels/meanShift.cl
+++ b/modules/ocl/src/opencl/meanShift.cl
@@ -240,4 +240,3 @@ __kernel void meanshiftproc_kernel( __global uchar4* in, __global uchar4* outr,
  //        outsp[basesp] =(short2)((short)x0,(short)y0);
      }
  }
-
diff --git a/modules/ocl/src/kernels/merge_mat.cl b/modules/ocl/src/opencl/merge_mat.cl

similarity index 100%

rename from modules/ocl/src/kernels/merge_mat.cl

rename to modules/ocl/src/opencl/merge_mat.cl
diff --git a/modules/ocl/src/kernels/moments.cl b/modules/ocl/src/opencl/moments.cl

similarity index 99%

rename from modules/ocl/src/kernels/moments.cl

rename to modules/ocl/src/opencl/moments.cl

index 6048837..399ff32 100644 (file)
--- a/modules/ocl/src/kernels/moments.cl
+++ b/modules/ocl/src/opencl/moments.cl
@@ -27,7 +27,7 @@ typedef long T;
  #define DST_ROW_A03     9
  
  __kernel void icvContourMoments(int contour_total,
-                                __global float* reader_oclmat_data, 
+                                __global float* reader_oclmat_data,
                                  __global T* dst_a,
                                  int dst_step)
  {
@@ -58,7 +58,7 @@ __kernel void icvContourMoments(int contour_total,
      dxy = xi_1 * yi - xi * yi_1;
      xii_1 = xi_1 + xi;
      yii_1 = yi_1 + yi;
-    
+
      dst_step /= sizeof(T);
      *( dst_a + DST_ROW_A00 * dst_step + idx) = dxy;
      *( dst_a + DST_ROW_A10 * dst_step + idx) = dxy * xii_1;
diff --git a/modules/ocl/src/kernels/nonfree_surf.cl b/modules/ocl/src/opencl/nonfree_surf.cl

similarity index 94%

rename from modules/ocl/src/kernels/nonfree_surf.cl

rename to modules/ocl/src/opencl/nonfree_surf.cl

index 8cffe3d..8c373bc 100644 (file)
--- a/modules/ocl/src/kernels/nonfree_surf.cl
+++ b/modules/ocl/src/opencl/nonfree_surf.cl
@@ -104,11 +104,11 @@ __constant sampler_t sampler    = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAM
  // N = 2
  // for simple haar paatern
  float icvCalcHaarPatternSum_2(
-    IMAGE_INT32 sumTex, 
-    __constant float src[2][5], 
-    int oldSize, 
-    int newSize, 
-    int y, int x, 
+    IMAGE_INT32 sumTex,
+    __constant float src[2][5],
+    int oldSize,
+    int newSize,
+    int y, int x,
      int rows, int cols, int elemPerRow)
  {
  
@@ -137,11 +137,11 @@ float icvCalcHaarPatternSum_2(
  
  // N = 3
  float icvCalcHaarPatternSum_3(
-    IMAGE_INT32 sumTex, 
-    __constant float src[2][5], 
-    int oldSize, 
-    int newSize, 
-    int y, int x, 
+    IMAGE_INT32 sumTex,
+    __constant float src[2][5],
+    int oldSize,
+    int newSize,
+    int y, int x,
      int rows, int cols, int elemPerRow)
  {
  
@@ -170,11 +170,11 @@ float icvCalcHaarPatternSum_3(
  
  // N = 4
  float icvCalcHaarPatternSum_4(
-    IMAGE_INT32 sumTex, 
-    __constant float src[2][5], 
-    int oldSize, 
-    int newSize, 
-    int y, int x, 
+    IMAGE_INT32 sumTex,
+    __constant float src[2][5],
+    int oldSize,
+    int newSize,
+    int y, int x,
      int rows, int cols, int elemPerRow)
  {
  
@@ -265,7 +265,7 @@ __kernel void icvCalcLayerDetAndTrace(
          const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
  
          det  [j + margin + det_step   * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy;
-        trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy; 
+        trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
      }
  }
  
@@ -301,9 +301,9 @@ bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int ro
  // Non-maximal suppression to further filtering the candidates from previous step
  __kernel
      void icvFindMaximaInLayer_withmask(
-    __global const float * det, 
-    __global const float * trace, 
-    __global int4 * maxPosBuffer, 
+    __global const float * det,
+    __global const float * trace,
+    __global int4 * maxPosBuffer,
      volatile __global int* maxCounter,
      int counter_offset,
      int det_step,     // the step of det in bytes
@@ -345,26 +345,26 @@ __kernel
      // Is this thread within the hessian buffer?
      const int zoff = get_local_size(0) * get_local_size(1);
      const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff;
-    N9[localLin - zoff] = 
-        det[det_step * 
+    N9[localLin - zoff] =
+        det[det_step *
          (c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1)) // y
          + min(max(j, 0), c_img_cols - 1)];                            // x
-    N9[localLin       ] = 
-        det[det_step * 
+    N9[localLin       ] =
+        det[det_step *
          (c_layer_rows * (layer    ) + min(max(i, 0), c_img_rows - 1)) // y
          + min(max(j, 0), c_img_cols - 1)];                            // x
-    N9[localLin + zoff] = 
-        det[det_step * 
+    N9[localLin + zoff] =
+        det[det_step *
          (c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1)) // y
          + min(max(j, 0), c_img_cols - 1)];                            // x
  
      barrier(CLK_LOCAL_MEM_FENCE);
  
-    if (i < c_layer_rows - margin 
+    if (i < c_layer_rows - margin
          && j < c_layer_cols - margin
-        && get_local_id(0) > 0 
+        && get_local_id(0) > 0
          && get_local_id(0) < get_local_size(0) - 1
-        && get_local_id(1) > 0 
+        && get_local_id(1) > 0
          && get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
          )
      {
@@ -429,9 +429,9 @@ __kernel
  
  __kernel
      void icvFindMaximaInLayer(
-    __global float * det, 
-    __global float * trace, 
-    __global int4 * maxPosBuffer, 
+    __global float * det,
+    __global float * trace,
+    __global int4 * maxPosBuffer,
      volatile __global  int* maxCounter,
      int counter_offset,
      int det_step,     // the step of det in bytes
@@ -474,19 +474,19 @@ __kernel
      int l_x = min(max(j, 0), c_img_cols - 1);
      int l_y = c_layer_rows * layer + min(max(i, 0), c_img_rows - 1);
  
-    N9[localLin - zoff] = 
+    N9[localLin - zoff] =
          det[det_step * (l_y - c_layer_rows) + l_x];
-    N9[localLin       ] = 
+    N9[localLin       ] =
          det[det_step * (l_y               ) + l_x];
-    N9[localLin + zoff] = 
+    N9[localLin + zoff] =
          det[det_step * (l_y + c_layer_rows) + l_x];
      barrier(CLK_LOCAL_MEM_FENCE);
  
-    if (i < c_layer_rows - margin 
+    if (i < c_layer_rows - margin
          && j < c_layer_cols - margin
-        && get_local_id(0) > 0 
+        && get_local_id(0) > 0
          && get_local_id(0) < get_local_size(0) - 1
-        && get_local_id(1) > 0 
+        && get_local_id(1) > 0
          && get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
          )
      {
@@ -554,17 +554,17 @@ inline bool solve3x3_float(volatile __local  const float A[3][3], volatile __loc
      {
          F invdet = 1.0 / det;
  
-        x[0] = invdet * 
+        x[0] = invdet *
              (b[0]    * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
              A[0][1] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) +
              A[0][2] * (b[1]    * A[2][1] - A[1][1] * b[2]   ));
  
-        x[1] = invdet * 
+        x[1] = invdet *
              (A[0][0] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) -
              b[0]    * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
              A[0][2] * (A[1][0] * b[2]    - b[1]    * A[2][0]));
  
-        x[2] = invdet * 
+        x[2] = invdet *
              (A[0][0] * (A[1][1] * b[2]    - b[1]    * A[2][1]) -
              A[0][1] * (A[1][0] * b[2]    - b[1]    * A[2][0]) +
              b[0]    * (A[1][0] * A[2][1] - A[1][1] * A[2][0]));
@@ -585,9 +585,9 @@ inline bool solve3x3_float(volatile __local  const float A[3][3], volatile __loc
  
  ////////////////////////////////////////////////////////////////////////
  // INTERPOLATION
-__kernel 
+__kernel
      void icvInterpolateKeypoint(
-    __global const float * det, 
+    __global const float * det,
      __global const int4 * maxPosBuffer,
      __global float * keypoints,
      volatile __global  int * featureCounter,
@@ -617,7 +617,7 @@ __kernel
  
      volatile __local  float N9[3][3][3];
  
-    N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] = 
+    N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] =
          det[det_step * (c_layer_rows * layer + i) + j];
      barrier(CLK_LOCAL_MEM_FENCE);
  
@@ -715,27 +715,27 @@ __kernel
  
  __constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
  __constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
-__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 
-    0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 
-    0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 
-    0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 
-    0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 
-    0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 
-    0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 
-    0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, 
-    0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 
-    0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, 
-    0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, 
-    0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, 
-    0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 
-    0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 
+__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f,
+    0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f,
+    0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f,
+    0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f,
+    0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f,
+    0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f,
+    0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f,
+    0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f,
+    0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f,
+    0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f,
+    0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f,
+    0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f,
+    0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f,
+    0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f,
      0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f,
-    0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 
-    0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, 
-    0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 
+    0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f,
+    0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f,
+    0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f,
      0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f,
-    0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 
-    0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 
+    0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f,
+    0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f,
      0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f,
      0.001707611023448408f, 0.001455130288377404f};
  
@@ -748,13 +748,13 @@ void reduce_32_sum(volatile __local  float * data, volatile float* partial_reduc
      data[tid] = *partial_reduction;
      barrier(CLK_LOCAL_MEM_FENCE);
  
-    if (tid < 16) 
+    if (tid < 16)
      {
          data[tid] = *partial_reduction = op(partial_reduction, data[tid + 16]);
          data[tid] = *partial_reduction = op(partial_reduction, data[tid + 8 ]);
          data[tid] = *partial_reduction = op(partial_reduction, data[tid + 4 ]);
          data[tid] = *partial_reduction = op(partial_reduction, data[tid + 2 ]);
-        data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]); 
+        data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]);
      }
  #undef op
  }
@@ -958,8 +958,8 @@ __constant float c_DW[PATCH_SZ * PATCH_SZ] =
  
  // utility for linear filter
  inline uchar readerGet(
-    IMAGE_INT8 src, 
-    const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir, 
+    IMAGE_INT8 src,
+    const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
      int i, int j, int rows, int cols, int elemPerRow
      )
  {
@@ -969,8 +969,8 @@ inline uchar readerGet(
  }
  
  inline float linearFilter(
-    IMAGE_INT8 src, 
-    const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,  
+    IMAGE_INT8 src,
+    const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
      float y, float x, int rows, int cols, int elemPerRow
      )
  {
@@ -1004,9 +1004,9 @@ void calc_dx_dy(
      volatile __local  float s_dx_bin[25],
      volatile __local  float s_dy_bin[25],
      volatile __local  float s_PATCH[6][6],
-    __global const float* featureX, 
-    __global const float* featureY, 
-    __global const float* featureSize, 
+    __global const float* featureX,
+    __global const float* featureY,
+    __global const float* featureSize,
      __global const float* featureDir,
      int rows,
      int cols,
@@ -1058,26 +1058,26 @@ void calc_dx_dy(
          const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
  
          const float vx = (
-            s_PATCH[get_local_id(1)    ][get_local_id(0) + 1] - 
-            s_PATCH[get_local_id(1)    ][get_local_id(0)    ] + 
-            s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] - 
-            s_PATCH[get_local_id(1) + 1][get_local_id(0)    ]) 
+            s_PATCH[get_local_id(1)    ][get_local_id(0) + 1] -
+            s_PATCH[get_local_id(1)    ][get_local_id(0)    ] +
+            s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
+            s_PATCH[get_local_id(1) + 1][get_local_id(0)    ])
              * dw;
          const float vy = (
-            s_PATCH[get_local_id(1) + 1][get_local_id(0)    ] - 
-            s_PATCH[get_local_id(1)    ][get_local_id(0)    ] + 
-            s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] - 
-            s_PATCH[get_local_id(1)    ][get_local_id(0) + 1]) 
+            s_PATCH[get_local_id(1) + 1][get_local_id(0)    ] -
+            s_PATCH[get_local_id(1)    ][get_local_id(0)    ] +
+            s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
+            s_PATCH[get_local_id(1)    ][get_local_id(0) + 1])
              * dw;
          s_dx_bin[tid] = vx;
          s_dy_bin[tid] = vy;
      }
  }
  void reduce_sum25(
-    volatile __local  float* sdata1, 
-    volatile __local  float* sdata2, 
-    volatile __local  float* sdata3, 
-    volatile __local  float* sdata4, 
+    volatile __local  float* sdata1,
+    volatile __local  float* sdata2,
+    volatile __local  float* sdata3,
+    volatile __local  float* sdata4,
      int tid
      )
  {
@@ -1115,13 +1115,13 @@ void reduce_sum25(
      }
  }
  
-__kernel 
+__kernel
      void compute_descriptors64(
      IMAGE_INT8 imgTex,
-    volatile __global float * descriptors, 
+    volatile __global float * descriptors,
      __global const float * keypoints,
      int descriptors_step,
-    int keypoints_step, 
+    int keypoints_step,
      int rows,
      int cols,
      int img_step
@@ -1155,7 +1155,7 @@ __kernel
      if (tid < 25)
      {
          reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
-    }    
+    }
      barrier(CLK_LOCAL_MEM_FENCE);
      if (tid < 25)
      {
@@ -1171,10 +1171,10 @@ __kernel
          }
      }
  }
-__kernel 
+__kernel
      void compute_descriptors128(
      IMAGE_INT8 imgTex,
-    __global volatile float * descriptors, 
+    __global volatile float * descriptors,
      __global float * keypoints,
      int descriptors_step,
      int keypoints_step,
@@ -1269,7 +1269,7 @@ __kernel
      }
  }
  
-__kernel 
+__kernel
      void normalize_descriptors128(__global float * descriptors, int descriptors_step)
  {
      descriptors_step /= sizeof(*descriptors);
@@ -1310,7 +1310,7 @@ __kernel
      // normalize and store in output
      descriptor_base[get_local_id(0)] = lookup / len;
  }
-__kernel 
+__kernel
      void normalize_descriptors64(__global float * descriptors, int descriptors_step)
  {
      descriptors_step /= sizeof(*descriptors);
diff --git a/modules/ocl/src/kernels/objdetect_hog.cl b/modules/ocl/src/opencl/objdetect_hog.cl

similarity index 100%

rename from modules/ocl/src/kernels/objdetect_hog.cl

rename to modules/ocl/src/opencl/objdetect_hog.cl
diff --git a/modules/ocl/src/kernels/operator_convertTo.cl b/modules/ocl/src/opencl/operator_convertTo.cl

similarity index 100%

rename from modules/ocl/src/kernels/operator_convertTo.cl

rename to modules/ocl/src/opencl/operator_convertTo.cl
diff --git a/modules/ocl/src/kernels/operator_copyToM.cl b/modules/ocl/src/opencl/operator_copyToM.cl

similarity index 100%

rename from modules/ocl/src/kernels/operator_copyToM.cl

rename to modules/ocl/src/opencl/operator_copyToM.cl
diff --git a/modules/ocl/src/kernels/operator_setTo.cl b/modules/ocl/src/opencl/operator_setTo.cl

similarity index 100%

rename from modules/ocl/src/kernels/operator_setTo.cl

rename to modules/ocl/src/opencl/operator_setTo.cl
diff --git a/modules/ocl/src/kernels/operator_setToM.cl b/modules/ocl/src/opencl/operator_setToM.cl

similarity index 99%

rename from modules/ocl/src/kernels/operator_setToM.cl

rename to modules/ocl/src/opencl/operator_setToM.cl

index 59357fa..dde12d8 100644 (file)
--- a/modules/ocl/src/kernels/operator_setToM.cl
+++ b/modules/ocl/src/opencl/operator_setToM.cl
@@ -57,4 +57,3 @@ __kernel void set_to_with_mask(
          }
  
  }
-
diff --git a/modules/ocl/src/kernels/pyr_down.cl b/modules/ocl/src/opencl/pyr_down.cl

similarity index 100%

rename from modules/ocl/src/kernels/pyr_down.cl

rename to modules/ocl/src/opencl/pyr_down.cl
diff --git a/modules/ocl/src/kernels/pyr_up.cl b/modules/ocl/src/opencl/pyr_up.cl

similarity index 100%

rename from modules/ocl/src/kernels/pyr_up.cl

rename to modules/ocl/src/opencl/pyr_up.cl
diff --git a/modules/ocl/src/kernels/pyrlk.cl b/modules/ocl/src/opencl/pyrlk.cl

similarity index 100%

rename from modules/ocl/src/kernels/pyrlk.cl

rename to modules/ocl/src/opencl/pyrlk.cl
diff --git a/modules/ocl/src/kernels/pyrlk_no_image.cl b/modules/ocl/src/opencl/pyrlk_no_image.cl

similarity index 100%

rename from modules/ocl/src/kernels/pyrlk_no_image.cl

rename to modules/ocl/src/opencl/pyrlk_no_image.cl
diff --git a/modules/ocl/src/kernels/split_mat.cl b/modules/ocl/src/opencl/split_mat.cl

similarity index 87%

rename from modules/ocl/src/kernels/split_mat.cl

rename to modules/ocl/src/opencl/split_mat.cl

index 3c70859..caee436 100644 (file)
--- a/modules/ocl/src/kernels/split_mat.cl
+++ b/modules/ocl/src/opencl/split_mat.cl
@@ -51,9 +51,9 @@
  ////////////vector fuction name format: split_vector_C(channels number)_D(data type depth)//////
  ////////////////////////////////////////////////////////////////////////////////////////////////
  __kernel void split_vector_C4_D0 (__global uchar *mat_src,  int src_step,  int src_offset,
-                                  __global uchar *mat_dst0, int dst0_step, int dst0_offset,  
-                                  __global uchar *mat_dst1, int dst1_step, int dst1_offset, 
-                                       __global uchar *mat_dst2, int dst2_step, int dst2_offset,  
+                                  __global uchar *mat_dst0, int dst0_step, int dst0_offset,
+                                  __global uchar *mat_dst1, int dst1_step, int dst1_offset,
+                                    __global uchar *mat_dst2, int dst2_step, int dst2_offset,
                                    __global uchar *mat_dst3, int dst3_step, int dst3_offset,
                                    int rows, int cols, int dst_step1)
  
@@ -61,37 +61,37 @@ __kernel void split_vector_C4_D0 (__global uchar *mat_src,  int src_step,  int s
      int x = get_global_id(0);
      int y = get_global_id(1);
  
-    if((x  < cols) && (y < rows)) 
+    if((x  < cols) && (y < rows))
      {
          x = x << 2;
  
-        int src_idx  = mad24(y, src_step, src_offset + (x << 2)); 
+        int src_idx  = mad24(y, src_step, src_offset + (x << 2));
  
-        int dst0_start = mad24(y, dst0_step, dst0_offset); 
+        int dst0_start = mad24(y, dst0_step, dst0_offset);
          int dst0_end   = mad24(y, dst0_step, dst0_offset + dst_step1);
          int dst0_idx   = mad24(y, dst0_step, dst0_offset + x) & (int)0xfffffffc;
  
-        int dst1_start = mad24(y, dst1_step, dst1_offset); 
+        int dst1_start = mad24(y, dst1_step, dst1_offset);
          int dst1_end   = mad24(y, dst1_step, dst1_offset + dst_step1);
          int dst1_idx   = mad24(y, dst1_step, dst1_offset + x) & (int)0xfffffffc;
  
-        int dst2_start = mad24(y, dst2_step, dst2_offset); 
+        int dst2_start = mad24(y, dst2_step, dst2_offset);
          int dst2_end   = mad24(y, dst2_step, dst2_offset + dst_step1);
          int dst2_idx   = mad24(y, dst2_step, dst2_offset + x) & (int)0xfffffffc;
  
-        int dst3_start = mad24(y, dst3_step, dst3_offset); 
+        int dst3_start = mad24(y, dst3_step, dst3_offset);
          int dst3_end   = mad24(y, dst3_step, dst3_offset + dst_step1);
          int dst3_idx   = mad24(y, dst3_step, dst3_offset + x) & (int)0xfffffffc;
-           
-        uchar4 data_0 = *((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? src_idx - 12 : src_idx))); 
-        uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8  >= 0 ? src_idx - 8  : src_idx))); 
-        uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4  >= 0 ? src_idx - 4  : src_idx))); 
-        uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0 )); 
  
-        int total_bytes = src_offset + rows * src_step; 
-        uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4  < total_bytes ? src_idx + 4  : src_idx))); 
-        uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8  < total_bytes ? src_idx + 8  : src_idx))); 
-        uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx)));  
+        uchar4 data_0 = *((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? src_idx - 12 : src_idx)));
+        uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8  >= 0 ? src_idx - 8  : src_idx)));
+        uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4  >= 0 ? src_idx - 4  : src_idx)));
+        uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0 ));
+
+        int total_bytes = src_offset + rows * src_step;
+        uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4  < total_bytes ? src_idx + 4  : src_idx)));
+        uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8  < total_bytes ? src_idx + 8  : src_idx)));
+        uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx)));
  
          uchar4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3;
  
@@ -164,33 +164,33 @@ __kernel void split_vector_C4_D0 (__global uchar *mat_src,  int src_step,  int s
  }
  
  __kernel void split_vector_C3_D0 (__global uchar *mat_src,  int src_step,  int src_offset,
-                                  __global uchar *mat_dst0, int dst0_step, int dst0_offset,  
-                                  __global uchar *mat_dst1, int dst1_step, int dst1_offset, 
-                                       __global uchar *mat_dst2, int dst2_step, int dst2_offset,  
+                                  __global uchar *mat_dst0, int dst0_step, int dst0_offset,
+                                  __global uchar *mat_dst1, int dst1_step, int dst1_offset,
+                                    __global uchar *mat_dst2, int dst2_step, int dst2_offset,
                                    int rows, int cols, int dst_step1)
  
  {
      int x = get_global_id(0);
      int y = get_global_id(1);
  
-    if((x  < cols) && (y < rows)) 
+    if((x  < cols) && (y < rows))
      {
          x = x << 2;
  
-        int src_idx  = mad24(y, src_step, src_offset); 
+        int src_idx  = mad24(y, src_step, src_offset);
  
-        int dst0_start = mad24(y, dst0_step, dst0_offset); 
+        int dst0_start = mad24(y, dst0_step, dst0_offset);
          int dst0_end   = mad24(y, dst0_step, dst0_offset + dst_step1);
          int dst0_idx   = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
  
-        int dst1_start = mad24(y, dst1_step, dst1_offset); 
+        int dst1_start = mad24(y, dst1_step, dst1_offset);
          int dst1_end   = mad24(y, dst1_step, dst1_offset + dst_step1);
          int dst1_idx   = mad24(y, dst1_step, dst1_offset + x  & (int)0xfffffffc);
  
-        int dst2_start = mad24(y, dst2_step, dst2_offset); 
+        int dst2_start = mad24(y, dst2_step, dst2_offset);
          int dst2_end   = mad24(y, dst2_step, dst2_offset + dst_step1);
          int dst2_idx   = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
-           
+
          uchar4 dst0_data  = *((__global uchar4 *)(mat_dst0 + dst0_idx));
          uchar4 dst1_data  = *((__global uchar4 *)(mat_dst1 + dst1_idx));
          uchar4 dst2_data  = *((__global uchar4 *)(mat_dst2 + dst2_idx));
@@ -227,10 +227,10 @@ __kernel void split_vector_C3_D0 (__global uchar *mat_src,  int src_step,  int s
  
          uchar data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18};
          int index = 3 - dst0_offset & 3;
-        tmp_data0 = (uchar4)(data[index], data[index + 1], data[index + 2], data[index + 3]); 
+        tmp_data0 = (uchar4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
  
          uchar4 data0, data1, data2;
-        
+
          data0     = (uchar4)(src_data_1, src_data_4, src_data_7, src_data_10);
          data1     = (dst1_offset & 3) == 2 ? (uchar4)(src_data_4, src_data_7, src_data_10, src_data_13)  : data0;
          data2     = (dst1_offset & 3) == 1 ? (uchar4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1;
@@ -263,33 +263,33 @@ __kernel void split_vector_C3_D0 (__global uchar *mat_src,  int src_step,  int s
  }
  
  __kernel void split_vector_C2_D0 (__global uchar *mat_src,  int src_step,  int src_offset,
-                                  __global uchar *mat_dst0, int dst0_step, int dst0_offset,  
-                                  __global uchar *mat_dst1, int dst1_step, int dst1_offset, 
+                                  __global uchar *mat_dst0, int dst0_step, int dst0_offset,
+                                  __global uchar *mat_dst1, int dst1_step, int dst1_offset,
                                    int rows, int cols, int dst_step1)
  
  {
      int x = get_global_id(0);
      int y = get_global_id(1);
  
-    if((x  < cols) && (y < rows)) 
+    if((x  < cols) && (y < rows))
      {
          x = x << 2;
  
          #define dst0_align ((dst0_offset & 3) << 1)
          #define dst1_align ((dst1_offset & 3) << 1)
-        int src_idx_0  = mad24(y, src_step, src_offset - dst0_align + (x << 1)); 
-        int src_idx_1  = mad24(y, src_step, src_offset - dst1_align + (x << 1)); 
+        int src_idx_0  = mad24(y, src_step, src_offset - dst0_align + (x << 1));
+        int src_idx_1  = mad24(y, src_step, src_offset - dst1_align + (x << 1));
  
-        int dst0_start = mad24(y, dst0_step, dst0_offset); 
+        int dst0_start = mad24(y, dst0_step, dst0_offset);
          int dst0_end   = mad24(y, dst0_step, dst0_offset + dst_step1);
          int dst0_idx   = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
  
-        int dst1_start = mad24(y, dst1_step, dst1_offset); 
+        int dst1_start = mad24(y, dst1_step, dst1_offset);
          int dst1_end   = mad24(y, dst1_step, dst1_offset + dst_step1);
          int dst1_idx   = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
-           
-               int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
-               int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
+
+        int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
+        int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
          uchar8 src_data_0 = vload8(0, mat_src + src_idx_0);
          uchar8 src_data_1 = vload8(0, mat_src + src_idx_1);
          if(src_idx_0 == -6)
@@ -326,9 +326,9 @@ __kernel void split_vector_C2_D0 (__global uchar *mat_src,  int src_step,  int s
  }
  
  __kernel void split_vector_C4_D1 (__global char *mat_src,  int src_step,  int src_offset,
-                                  __global char *mat_dst0, int dst0_step, int dst0_offset,  
-                                  __global char *mat_dst1, int dst1_step, int dst1_offset, 
-                                       __global char *mat_dst2, int dst2_step, int dst2_offset,  
+                                  __global char *mat_dst0, int dst0_step, int dst0_offset,
+                                  __global char *mat_dst1, int dst1_step, int dst1_offset,
+                                    __global char *mat_dst2, int dst2_step, int dst2_offset,
                                    __global char *mat_dst3, int dst3_step, int dst3_offset,
                                    int rows, int cols, int dst_step1)
  
@@ -336,35 +336,35 @@ __kernel void split_vector_C4_D1 (__global char *mat_src,  int src_step,  int sr
      int x = get_global_id(0);
      int y = get_global_id(1);
  
-    if((x  < cols) && (y < rows)) 
+    if((x  < cols) && (y < rows))
      {
          x = x << 2;
  
-        int src_idx  = mad24(y, src_step, src_offset + (x << 2)); 
+        int src_idx  = mad24(y, src_step, src_offset + (x << 2));
  
-        int dst0_start = mad24(y, dst0_step, dst0_offset); 
+        int dst0_start = mad24(y, dst0_step, dst0_offset);
          int dst0_end   = mad24(y, dst0_step, dst0_offset + dst_step1);
          int dst0_idx   = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
  
-        int dst1_start = mad24(y, dst1_step, dst1_offset); 
+        int dst1_start = mad24(y, dst1_step, dst1_offset);
          int dst1_end   = mad24(y, dst1_step, dst1_offset + dst_step1);
          int dst1_idx   = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
  
-        int dst2_start = mad24(y, dst2_step, dst2_offset); 
+        int dst2_start = mad24(y, dst2_step, dst2_offset);
          int dst2_end   = mad24(y, dst2_step, dst2_offset + dst_step1);
          int dst2_idx   = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
  
-        int dst3_start = mad24(y, dst3_step, dst3_offset); 
+        int dst3_start = mad24(y, dst3_step, dst3_offset);
          int dst3_end   = mad24(y, dst3_step, dst3_offset + dst_step1);
          int dst3_idx   = mad24(y, dst3_step, dst3_offset + x & (int)0xfffffffc);
-           
-        char4 data_0 = *((global char4 *)(mat_src + src_idx - 12)); 
-        char4 data_1 = *((global char4 *)(mat_src + src_idx - 8 )); 
-        char4 data_2 = *((global char4 *)(mat_src + src_idx - 4 )); 
-        char4 data_3 = *((global char4 *)(mat_src + src_idx + 0 )); 
-        char4 data_4 = *((global char4 *)(mat_src + src_idx + 4 )); 
-        char4 data_5 = *((global char4 *)(mat_src + src_idx + 8 )); 
-        char4 data_6 = *((global char4 *)(mat_src + src_idx + 12)); 
+
+        char4 data_0 = *((global char4 *)(mat_src + src_idx - 12));
+        char4 data_1 = *((global char4 *)(mat_src + src_idx - 8 ));
+        char4 data_2 = *((global char4 *)(mat_src + src_idx - 4 ));
+        char4 data_3 = *((global char4 *)(mat_src + src_idx + 0 ));
+        char4 data_4 = *((global char4 *)(mat_src + src_idx + 4 ));
+        char4 data_5 = *((global char4 *)(mat_src + src_idx + 8 ));
+        char4 data_6 = *((global char4 *)(mat_src + src_idx + 12));
  
          char4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3;
  
@@ -437,33 +437,33 @@ __kernel void split_vector_C4_D1 (__global char *mat_src,  int src_step,  int sr
  }
  
  __kernel void split_vector_C3_D1 (__global char *mat_src,  int src_step,  int src_offset,
-                                  __global char *mat_dst0, int dst0_step, int dst0_offset,  
-                                  __global char *mat_dst1, int dst1_step, int dst1_offset, 
-                                       __global char *mat_dst2, int dst2_step, int dst2_offset,  
+                                  __global char *mat_dst0, int dst0_step, int dst0_offset,
+                                  __global char *mat_dst1, int dst1_step, int dst1_offset,
+                                    __global char *mat_dst2, int dst2_step, int dst2_offset,
                                    int rows, int cols, int dst_step1)
  
  {
      int x = get_global_id(0);
      int y = get_global_id(1);
  
-    if((x  < cols) && (y < rows)) 
+    if((x  < cols) && (y < rows))
      {
          x = x << 2;
  
-        int src_idx  = mad24(y, src_step, src_offset); 
+        int src_idx  = mad24(y, src_step, src_offset);
  
-        int dst0_start = mad24(y, dst0_step, dst0_offset); 
+        int dst0_start = mad24(y, dst0_step, dst0_offset);
          int dst0_end   = mad24(y, dst0_step, dst0_offset + dst_step1);
          int dst0_idx   = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
  
-        int dst1_start = mad24(y, dst1_step, dst1_offset); 
+        int dst1_start = mad24(y, dst1_step, dst1_offset);
          int dst1_end   = mad24(y, dst1_step, dst1_offset + dst_step1);
          int dst1_idx   = mad24(y, dst1_step, dst1_offset + x  & (int)0xfffffffc);
  
-        int dst2_start = mad24(y, dst2_step, dst2_offset); 
+        int dst2_start = mad24(y, dst2_step, dst2_offset);
          int dst2_end   = mad24(y, dst2_step, dst2_offset + dst_step1);
          int dst2_idx   = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
-           
+
          char4 dst0_data  = *((__global char4 *)(mat_dst0 + dst0_idx));
          char4 dst1_data  = *((__global char4 *)(mat_dst1 + dst1_idx));
          char4 dst2_data  = *((__global char4 *)(mat_dst2 + dst2_idx));
@@ -500,10 +500,10 @@ __kernel void split_vector_C3_D1 (__global char *mat_src,  int src_step,  int sr
  
          char data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18};
          int index = 3 - dst0_offset & 3;
-        tmp_data0 = (char4)(data[index], data[index + 1], data[index + 2], data[index + 3]); 
+        tmp_data0 = (char4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
  
          char4 data0, data1, data2;
-        
+
          data0     = (char4)(src_data_1, src_data_4, src_data_7, src_data_10);
          data1     = (dst1_offset & 3) == 2 ? (char4)(src_data_4, src_data_7, src_data_10, src_data_13)  : data0;
          data2     = (dst1_offset & 3) == 1 ? (char4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1;
@@ -536,32 +536,32 @@ __kernel void split_vector_C3_D1 (__global char *mat_src,  int src_step,  int sr
  }
  
  __kernel void split_vector_C2_D1 (__global char *mat_src,  int src_step,  int src_offset,
-                                  __global char *mat_dst0, int dst0_step, int dst0_offset,  
-                                  __global char *mat_dst1, int dst1_step, int dst1_offset, 
+                                  __global char *mat_dst0, int dst0_step, int dst0_offset,
+                                  __global char *mat_dst1, int dst1_step, int dst1_offset,
                                    int rows, int cols, int dst_step1)
  
  {
      int x = get_global_id(0);
      int y = get_global_id(1);
  
-    if((x  < cols) && (y < rows)) 
+    if((x  < cols) && (y < rows))
      {
          x = x << 2;
  
          #define dst0_align ((dst0_offset & 3) << 1)
          #define dst1_align ((dst1_offset & 3) << 1)
-        int src_idx_0  = mad24(y, src_step, src_offset - dst0_align + (x << 1)); 
-        int src_idx_1  = mad24(y, src_step, src_offset - dst1_align + (x << 1)); 
+        int src_idx_0  = mad24(y, src_step, src_offset - dst0_align + (x << 1));
+        int src_idx_1  = mad24(y, src_step, src_offset - dst1_align + (x << 1));
  
-        int dst0_start = mad24(y, dst0_step, dst0_offset); 
+        int dst0_start = mad24(y, dst0_step, dst0_offset);
          int dst0_end   = mad24(y, dst0_step, dst0_offset + dst_step1);
          int dst0_idx   = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
  
-        int dst1_start = mad24(y, dst1_step, dst1_offset); 
+        int dst1_start = mad24(y, dst1_step, dst1_offset);
          int dst1_end   = mad24(y, dst1_step, dst1_offset + dst_step1);
          int dst1_idx   = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
-       int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
-               int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
+    int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
+        int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
          char8 src_data_0 = vload8(0, mat_src + src_idx_0);
          char8 src_data_1 = vload8(0, mat_src + src_idx_1);
          if(src_idx_0 == -6)
@@ -597,9 +597,9 @@ __kernel void split_vector_C2_D1 (__global char *mat_src,  int src_step,  int sr
  }
  
  __kernel void split_vector_C4_D2 (__global ushort *mat_src,  int src_step,  int src_offset,
-                                  __global ushort *mat_dst0, int dst0_step, int dst0_offset,  
-                                  __global ushort *mat_dst1, int dst1_step, int dst1_offset, 
-                                       __global ushort *mat_dst2, int dst2_step, int dst2_offset,  
+                                  __global ushort *mat_dst0, int dst0_step, int dst0_offset,
+                                  __global ushort *mat_dst1, int dst1_step, int dst1_offset,
+                                    __global ushort *mat_dst2, int dst2_step, int dst2_offset,
                                    __global ushort *mat_dst3, int dst3_step, int dst3_offset,
                                    int rows, int cols, int dst_step1)
  
@@ -607,30 +607,30 @@ __kernel void split_vector_C4_D2 (__global ushort *mat_src,  int src_step,  int
      int x = get_global_id(0);
      int y = get_global_id(1);
  
-    if((x  < cols) && (y < rows)) 
+    if((x  < cols) && (y < rows))
      {
          x = x << 1;
  
-        int src_idx_0  = mad24(y, src_step, src_offset + (x << 3) - 8); 
-        int src_idx_1  = mad24(y, src_step, src_offset + (x << 3) + 8); 
+        int src_idx_0  = mad24(y, src_step, src_offset + (x << 3) - 8);
+        int src_idx_1  = mad24(y, src_step, src_offset + (x << 3) + 8);
  
-        int dst0_start = mad24(y, dst0_step, dst0_offset); 
+        int dst0_start = mad24(y, dst0_step, dst0_offset);
          int dst0_end   = mad24(y, dst0_step, dst0_offset + dst_step1);
          int dst0_idx   = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
  
-        int dst1_start = mad24(y, dst1_step, dst1_offset); 
+        int dst1_start = mad24(y, dst1_step, dst1_offset);
          int dst1_end   = mad24(y, dst1_step, dst1_offset + dst_step1);
          int dst1_idx   = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
  
-        int dst2_start = mad24(y, dst2_step, dst2_offset); 
+        int dst2_start = mad24(y, dst2_step, dst2_offset);
          int dst2_end   = mad24(y, dst2_step, dst2_offset + dst_step1);
          int dst2_idx   = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
  
-        int dst3_start = mad24(y, dst3_step, dst3_offset); 
+        int dst3_start = mad24(y, dst3_step, dst3_offset);
          int dst3_end   = mad24(y, dst3_step, dst3_offset + dst_step1);
          int dst3_idx   = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc);
-           
-       int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
+
+    int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
          ushort8 src_data0 = vload8(0,(__global ushort *)((__global char *)mat_src + src_idx_0));
               if(src_idx_0 == -6)
              src_data0.s01234567 = src_data0.s67012345;
@@ -672,33 +672,33 @@ __kernel void split_vector_C4_D2 (__global ushort *mat_src,  int src_step,  int
  }
  
  __kernel void split_vector_C3_D2 (__global ushort *mat_src,  int src_step,  int src_offset,
-                                  __global ushort *mat_dst0, int dst0_step, int dst0_offset,  
-                                  __global ushort *mat_dst1, int dst1_step, int dst1_offset, 
-                                       __global ushort *mat_dst2, int dst2_step, int dst2_offset,  
+                                  __global ushort *mat_dst0, int dst0_step, int dst0_offset,
+                                  __global ushort *mat_dst1, int dst1_step, int dst1_offset,
+                                    __global ushort *mat_dst2, int dst2_step, int dst2_offset,
                                    int rows, int cols, int dst_step1)
  
  {
      int x = get_global_id(0);
      int y = get_global_id(1);
  
-    if((x  < cols) && (y < rows)) 
+    if((x  < cols) && (y < rows))
      {
          x = x << 1;
  
-        int src_idx  = mad24(y, src_step, src_offset); 
+        int src_idx  = mad24(y, src_step, src_offset);
  
-        int dst0_start = mad24(y, dst0_step, dst0_offset); 
+        int dst0_start = mad24(y, dst0_step, dst0_offset);
          int dst0_end   = mad24(y, dst0_step, dst0_offset + dst_step1);
          int dst0_idx   = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
  
-        int dst1_start = mad24(y, dst1_step, dst1_offset); 
+        int dst1_start = mad24(y, dst1_step, dst1_offset);
          int dst1_end   = mad24(y, dst1_step, dst1_offset + dst_step1);
          int dst1_idx   = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
  
-        int dst2_start = mad24(y, dst2_step, dst2_offset); 
+        int dst2_start = mad24(y, dst2_step, dst2_offset);
          int dst2_end   = mad24(y, dst2_step, dst2_offset + dst_step1);
          int dst2_idx   = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
-           
+
          ushort2 dst0_data  = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
          ushort2 dst1_data  = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));
          ushort2 dst2_data  = *((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx));
@@ -735,48 +735,48 @@ __kernel void split_vector_C3_D2 (__global ushort *mat_src,  int src_step,  int
  }
  
  __kernel void split_vector_C2_D2 (__global ushort *mat_src,  int src_step,  int src_offset,
-                                  __global ushort *mat_dst0, int dst0_step, int dst0_offset,  
-                                  __global ushort *mat_dst1, int dst1_step, int dst1_offset, 
+                                  __global ushort *mat_dst0, int dst0_step, int dst0_offset,
+                                  __global ushort *mat_dst1, int dst1_step, int dst1_offset,
                                    int rows, int cols, int dst_step1)
  
  {
      int x = get_global_id(0);
      int y = get_global_id(1);
  
-    if((x  < cols) && (y < rows)) 
+    if((x  < cols) && (y < rows))
      {
          x = x << 1;
  
          #define dst0_align ((dst0_offset & 3) << 1)
          #define dst1_align ((dst1_offset & 3) << 1)
-        int src_idx_0  = mad24(y, src_step, src_offset - dst0_align + (x << 2)); 
-        int src_idx_1  = mad24(y, src_step, src_offset - dst1_align + (x << 2)); 
+        int src_idx_0  = mad24(y, src_step, src_offset - dst0_align + (x << 2));
+        int src_idx_1  = mad24(y, src_step, src_offset - dst1_align + (x << 2));
  
-        int dst0_start = mad24(y, dst0_step, dst0_offset); 
+        int dst0_start = mad24(y, dst0_step, dst0_offset);
          int dst0_end   = mad24(y, dst0_step, dst0_offset + dst_step1);
          int dst0_idx   = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
  
-        int dst1_start = mad24(y, dst1_step, dst1_offset); 
+        int dst1_start = mad24(y, dst1_step, dst1_offset);
          int dst1_end   = mad24(y, dst1_step, dst1_offset + dst_step1);
          int dst1_idx   = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
-           
-               int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
-               int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
+
+        int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
+        int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
          ushort4 src_data_0 = vload4(0, (__global ushort *)((__global char *)mat_src + src1_index_fix));
          ushort4 src_data_1 = vload4(0, (__global ushort *)((__global char *)mat_src + src2_index_fix));
-               if(src_idx_0 < 0)
-               {
-                       ushort4 tmp;
-                       tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
-                       src_data_0.xyzw = (src_idx_1 == -1) ? src_data_0.wxyz:tmp.xyzw;
-               }
-               if(src_idx_1 < 0)
-               {
-                       ushort4 tmp;
-                       tmp.xyzw = (src_idx_1 == -2) ? src_data_1.zwxy : src_data_1.yzwx;
-                       src_data_1.xyzw = (src_idx_1 == -1) ? src_data_1.wxyz : tmp.xyzw;
-               }               
-  
+        if(src_idx_0 < 0)
+        {
+            ushort4 tmp;
+            tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
+            src_data_0.xyzw = (src_idx_1 == -1) ? src_data_0.wxyz:tmp.xyzw;
+        }
+        if(src_idx_1 < 0)
+        {
+            ushort4 tmp;
+            tmp.xyzw = (src_idx_1 == -2) ? src_data_1.zwxy : src_data_1.yzwx;
+            src_data_1.xyzw = (src_idx_1 == -1) ? src_data_1.wxyz : tmp.xyzw;
+        }
+
          ushort2 dst0_data  = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
          ushort2 dst1_data  = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));
  
@@ -793,9 +793,9 @@ __kernel void split_vector_C2_D2 (__global ushort *mat_src,  int src_step,  int
      }
  }
  __kernel void split_vector_C4_D3 (__global short *mat_src,  int src_step,  int src_offset,
-                                  __global short *mat_dst0, int dst0_step, int dst0_offset,  
-                                  __global short *mat_dst1, int dst1_step, int dst1_offset, 
-                                       __global short *mat_dst2, int dst2_step, int dst2_offset,  
+                                  __global short *mat_dst0, int dst0_step, int dst0_offset,
+                                  __global short *mat_dst1, int dst1_step, int dst1_offset,
+                                    __global short *mat_dst2, int dst2_step, int dst2_offset,
                                    __global short *mat_dst3, int dst3_step, int dst3_offset,
                                    int rows, int cols, int dst_step1)
  
@@ -803,38 +803,38 @@ __kernel void split_vector_C4_D3 (__global short *mat_src,  int src_step,  int s
      int x = get_global_id(0);
      int y = get_global_id(1);
  
-    if((x  < cols) && (y < rows)) 
+    if((x  < cols) && (y < rows))
      {
          x = x << 1;
  
-        int src_idx_0  = mad24(y, src_step, src_offset + (x << 3) - 8); 
-        int src_idx_1  = mad24(y, src_step, src_offset + (x << 3) + 8); 
+        int src_idx_0  = mad24(y, src_step, src_offset + (x << 3) - 8);
+        int src_idx_1  = mad24(y, src_step, src_offset + (x << 3) + 8);
  
-        int dst0_start = mad24(y, dst0_step, dst0_offset); 
+        int dst0_start = mad24(y, dst0_step, dst0_offset);
          int dst0_end   = mad24(y, dst0_step, dst0_offset + dst_step1);
          int dst0_idx   = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
  
-        int dst1_start = mad24(y, dst1_step, dst1_offset); 
+        int dst1_start = mad24(y, dst1_step, dst1_offset);
          int dst1_end   = mad24(y, dst1_step, dst1_offset + dst_step1);
          int dst1_idx   = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
  
-        int dst2_start = mad24(y, dst2_step, dst2_offset); 
+        int dst2_start = mad24(y, dst2_step, dst2_offset);
          int dst2_end   = mad24(y, dst2_step, dst2_offset + dst_step1);
          int dst2_idx   = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
  
-        int dst3_start = mad24(y, dst3_step, dst3_offset); 
+        int dst3_start = mad24(y, dst3_step, dst3_offset);
          int dst3_end   = mad24(y, dst3_step, dst3_offset + dst_step1);
          int dst3_idx   = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc);
-       int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
+        int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
          short8 src_data0 = vload8(0,(__global short *)((__global char *)mat_src + src_idx_0));
- 
+
          if(src_idx_0 == -6)
              src_data0.s01234567 = src_data0.s67012345;
          if(src_idx_0 == -4)
              src_data0.s01234567 = src_data0.s45670123;
          if(src_idx_0 == -2)
              src_data0.s01234567 = src_data0.s23456701;
-          
+
          short4 src_data1 = *((__global short4 *)((__global char *)mat_src + src_idx_1));
  
          short2 dst0_data  = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
@@ -868,33 +868,33 @@ __kernel void split_vector_C4_D3 (__global short *mat_src,  int src_step,  int s
      }
  }
  __kernel void split_vector_C3_D3 (__global short *mat_src,  int src_step,  int src_offset,
-                                  __global short *mat_dst0, int dst0_step, int dst0_offset,  
-                                  __global short *mat_dst1, int dst1_step, int dst1_offset, 
-                                       __global short *mat_dst2, int dst2_step, int dst2_offset,  
+                                  __global short *mat_dst0, int dst0_step, int dst0_offset,
+                                  __global short *mat_dst1, int dst1_step, int dst1_offset,
+                                    __global short *mat_dst2, int dst2_step, int dst2_offset,
                                    int rows, int cols, int dst_step1)
  
  {
      int x = get_global_id(0);
      int y = get_global_id(1);
  
-    if((x  < cols) && (y < rows)) 
+    if((x  < cols) && (y < rows))
      {
          x = x << 1;
  
-        int src_idx  = mad24(y, src_step, src_offset); 
+        int src_idx  = mad24(y, src_step, src_offset);
  
-        int dst0_start = mad24(y, dst0_step, dst0_offset); 
+        int dst0_start = mad24(y, dst0_step, dst0_offset);
          int dst0_end   = mad24(y, dst0_step, dst0_offset + dst_step1);
          int dst0_idx   = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
  
-        int dst1_start = mad24(y, dst1_step, dst1_offset); 
+        int dst1_start = mad24(y, dst1_step, dst1_offset);
          int dst1_end   = mad24(y, dst1_step, dst1_offset + dst_step1);
          int dst1_idx   = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
  
-        int dst2_start = mad24(y, dst2_step, dst2_offset); 
+        int dst2_start = mad24(y, dst2_step, dst2_offset);
          int dst2_end   = mad24(y, dst2_step, dst2_offset + dst_step1);
          int dst2_idx   = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
-           
+
          short2 dst0_data  = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
          short2 dst1_data  = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
          short2 dst2_data  = *((__global short2 *)((__global char *)mat_dst2 + dst2_idx));
@@ -932,47 +932,47 @@ __kernel void split_vector_C3_D3 (__global short *mat_src,  int src_step,  int s
  
  
  __kernel void split_vector_C2_D3 (__global short *mat_src,  int src_step,  int src_offset,
-                                  __global short *mat_dst0, int dst0_step, int dst0_offset,  
-                                  __global short *mat_dst1, int dst1_step, int dst1_offset, 
+                                  __global short *mat_dst0, int dst0_step, int dst0_offset,
+                                  __global short *mat_dst1, int dst1_step, int dst1_offset,
                                    int rows, int cols, int dst_step1)
  
  {
      int x = get_global_id(0);
      int y = get_global_id(1);
  
-    if((x  < cols) && (y < rows)) 
+    if((x  < cols) && (y < rows))
      {
          x = x << 1;
  
          #define dst0_align ((dst0_offset & 3) << 1)
          #define dst1_align ((dst1_offset & 3) << 1)
-        int src_idx_0  = mad24(y, src_step, src_offset - dst0_align + (x << 2)); 
-        int src_idx_1  = mad24(y, src_step, src_offset - dst1_align + (x << 2)); 
+        int src_idx_0  = mad24(y, src_step, src_offset - dst0_align + (x << 2));
+        int src_idx_1  = mad24(y, src_step, src_offset - dst1_align + (x << 2));
  
-        int dst0_start = mad24(y, dst0_step, dst0_offset); 
+        int dst0_start = mad24(y, dst0_step, dst0_offset);
          int dst0_end   = mad24(y, dst0_step, dst0_offset + dst_step1);
          int dst0_idx   = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
  
-        int dst1_start = mad24(y, dst1_step, dst1_offset); 
+        int dst1_start = mad24(y, dst1_step, dst1_offset);
          int dst1_end   = mad24(y, dst1_step, dst1_offset + dst_step1);
          int dst1_idx   = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
-               int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
-               int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
+        int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
+        int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
          short4 src_data_0 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_0));
          short4 src_data_1 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_1));
-               if(src_idx_0 < 0)
-               {
-                       short4 tmp;
-                       tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
-                       src_data_0.xyzw = (src_idx_0 == -1) ? src_data_0.wxyz:tmp.xyzw;
-               }
-               if(src_idx_1< 0)
-               {
-                       short4 tmp;
-                       tmp.xyzw = ( src_idx_1== -2) ? src_data_1.zwxy : src_data_1.yzwx;
-                       src_data_1.xyzw = ( src_idx_1== -1) ? src_data_1.wxyz : tmp.xyzw;
-               }               
-             
+        if(src_idx_0 < 0)
+        {
+            short4 tmp;
+            tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
+            src_data_0.xyzw = (src_idx_0 == -1) ? src_data_0.wxyz:tmp.xyzw;
+        }
+        if(src_idx_1< 0)
+        {
+            short4 tmp;
+            tmp.xyzw = ( src_idx_1== -2) ? src_data_1.zwxy : src_data_1.yzwx;
+            src_data_1.xyzw = ( src_idx_1== -1) ? src_data_1.wxyz : tmp.xyzw;
+        }
+
  
          short2 dst0_data  = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
          short2 dst1_data  = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
@@ -990,9 +990,9 @@ __kernel void split_vector_C2_D3 (__global short *mat_src,  int src_step,  int s
      }
  }
  __kernel void split_vector_C4_D4 (__global int *mat_src,  int src_step,  int src_offset,
-                                  __global int *mat_dst0, int dst0_step, int dst0_offset,  
-                                  __global int *mat_dst1, int dst1_step, int dst1_offset, 
-                                       __global int *mat_dst2, int dst2_step, int dst2_offset,  
+                                  __global int *mat_dst0, int dst0_step, int dst0_offset,
+                                  __global int *mat_dst1, int dst1_step, int dst1_offset,
+                                    __global int *mat_dst2, int dst2_step, int dst2_offset,
                                    __global int *mat_dst3, int dst3_step, int dst3_offset,
                                    int rows, int cols, int dst_step1)
  
@@ -1000,14 +1000,14 @@ __kernel void split_vector_C4_D4 (__global int *mat_src,  int src_step,  int src
      int x = get_global_id(0);
      int y = get_global_id(1);
  
-    if((x  < cols) && (y < rows)) 
+    if((x  < cols) && (y < rows))
      {
-        int src_idx  = mad24(y, src_step,  src_offset); 
+        int src_idx  = mad24(y, src_step,  src_offset);
          int dst0_idx = mad24(y, dst0_step, dst0_offset);
          int dst1_idx = mad24(y, dst1_step, dst1_offset);
          int dst2_idx = mad24(y, dst2_step, dst2_offset);
          int dst3_idx = mad24(y, dst3_step, dst3_offset);
-           
+
          int4 src_data = ((__global int4 *)((__global char *)mat_src + src_idx))[x];
  
          ((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@@ -1017,18 +1017,18 @@ __kernel void split_vector_C4_D4 (__global int *mat_src,  int src_step,  int src
      }
  }
  __kernel void split_vector_C3_D4 (__global int *mat_src,  int src_step,  int src_offset,
-                                  __global int *mat_dst0, int dst0_step, int dst0_offset,  
-                                  __global int *mat_dst1, int dst1_step, int dst1_offset, 
-                                       __global int *mat_dst2, int dst2_step, int dst2_offset,  
+                                  __global int *mat_dst0, int dst0_step, int dst0_offset,
+                                  __global int *mat_dst1, int dst1_step, int dst1_offset,
+                                    __global int *mat_dst2, int dst2_step, int dst2_offset,
                                    int rows, int cols, int dst_step1)
  
  {
      int x = get_global_id(0);
      int y = get_global_id(1);
  
-    if((x  < cols) && (y < rows)) 
+    if((x  < cols) && (y < rows))
      {
-        int src_idx  = mad24(y, src_step,  src_offset); 
+        int src_idx  = mad24(y, src_step,  src_offset);
          int dst0_idx = mad24(y, dst0_step, dst0_offset);
          int dst1_idx = mad24(y, dst1_step, dst1_offset);
          int dst2_idx = mad24(y, dst2_step, dst2_offset);
@@ -1044,20 +1044,20 @@ __kernel void split_vector_C3_D4 (__global int *mat_src,  int src_step,  int src
  }
  
  __kernel void split_vector_C2_D4 (__global int *mat_src,  int src_step,  int src_offset,
-                                  __global int *mat_dst0, int dst0_step, int dst0_offset,  
-                                  __global int *mat_dst1, int dst1_step, int dst1_offset, 
+                                  __global int *mat_dst0, int dst0_step, int dst0_offset,
+                                  __global int *mat_dst1, int dst1_step, int dst1_offset,
                                    int rows, int cols, int dst_step1)
  
  {
      int x = get_global_id(0);
      int y = get_global_id(1);
  
-    if((x  < cols) && (y < rows)) 
+    if((x  < cols) && (y < rows))
      {
-        int src_idx  = mad24(y, src_step,  src_offset); 
+        int src_idx  = mad24(y, src_step,  src_offset);
          int dst0_idx = mad24(y, dst0_step, dst0_offset);
          int dst1_idx = mad24(y, dst1_step, dst1_offset);
-           
+
          int2 src_data = ((__global int2 *)((__global char *)mat_src + src_idx))[x];
  
          ((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@@ -1066,9 +1066,9 @@ __kernel void split_vector_C2_D4 (__global int *mat_src,  int src_step,  int src
  }
  
  __kernel void split_vector_C4_D5 (__global float *mat_src,  int src_step,  int src_offset,
-                                  __global float *mat_dst0, int dst0_step, int dst0_offset,  
-                                  __global float *mat_dst1, int dst1_step, int dst1_offset, 
-                                       __global float *mat_dst2, int dst2_step, int dst2_offset,  
+                                  __global float *mat_dst0, int dst0_step, int dst0_offset,
+                                  __global float *mat_dst1, int dst1_step, int dst1_offset,
+                                    __global float *mat_dst2, int dst2_step, int dst2_offset,
                                    __global float *mat_dst3, int dst3_step, int dst3_offset,
                                    int rows, int cols, int dst_step1)
  
@@ -1076,14 +1076,14 @@ __kernel void split_vector_C4_D5 (__global float *mat_src,  int src_step,  int s
      int x = get_global_id(0);
      int y = get_global_id(1);
  
-    if((x  < cols) && (y < rows)) 
+    if((x  < cols) && (y < rows))
      {
-        int src_idx  = mad24(y, src_step,  src_offset); 
+        int src_idx  = mad24(y, src_step,  src_offset);
          int dst0_idx = mad24(y, dst0_step, dst0_offset);
          int dst1_idx = mad24(y, dst1_step, dst1_offset);
          int dst2_idx = mad24(y, dst2_step, dst2_offset);
          int dst3_idx = mad24(y, dst3_step, dst3_offset);
-           
+
          float4 src_data = ((__global float4 *)((__global char *)mat_src + src_idx))[x];
  
          ((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@@ -1094,18 +1094,18 @@ __kernel void split_vector_C4_D5 (__global float *mat_src,  int src_step,  int s
  }
  
  __kernel void split_vector_C3_D5 (__global float *mat_src,  int src_step,  int src_offset,
-                                  __global float *mat_dst0, int dst0_step, int dst0_offset,  
-                                  __global float *mat_dst1, int dst1_step, int dst1_offset, 
-                                       __global float *mat_dst2, int dst2_step, int dst2_offset,  
+                                  __global float *mat_dst0, int dst0_step, int dst0_offset,
+                                  __global float *mat_dst1, int dst1_step, int dst1_offset,
+                                    __global float *mat_dst2, int dst2_step, int dst2_offset,
                                    int rows, int cols, int dst_step1)
  
  {
      int x = get_global_id(0);
      int y = get_global_id(1);
  
-    if((x  < cols) && (y < rows)) 
+    if((x  < cols) && (y < rows))
      {
-        int src_idx  = mad24(y, src_step,  src_offset); 
+        int src_idx  = mad24(y, src_step,  src_offset);
          int dst0_idx = mad24(y, dst0_step, dst0_offset);
          int dst1_idx = mad24(y, dst1_step, dst1_offset);
          int dst2_idx = mad24(y, dst2_step, dst2_offset);
@@ -1121,20 +1121,20 @@ __kernel void split_vector_C3_D5 (__global float *mat_src,  int src_step,  int s
  }
  
  __kernel void split_vector_C2_D5 (__global float *mat_src,  int src_step,  int src_offset,
-                                  __global float *mat_dst0, int dst0_step, int dst0_offset,  
-                                  __global float *mat_dst1, int dst1_step, int dst1_offset, 
+                                  __global float *mat_dst0, int dst0_step, int dst0_offset,
+                                  __global float *mat_dst1, int dst1_step, int dst1_offset,
                                    int rows, int cols, int dst_step1)
  
  {
      int x = get_global_id(0);
      int y = get_global_id(1);
  
-    if((x  < cols) && (y < rows)) 
+    if((x  < cols) && (y < rows))
      {
-        int src_idx  = mad24(y, src_step,  src_offset); 
+        int src_idx  = mad24(y, src_step,  src_offset);
          int dst0_idx = mad24(y, dst0_step, dst0_offset);
          int dst1_idx = mad24(y, dst1_step, dst1_offset);
-           
+
          float2 src_data = ((__global float2 *)((__global char *)mat_src + src_idx))[x];
  
          ((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@@ -1144,9 +1144,9 @@ __kernel void split_vector_C2_D5 (__global float *mat_src,  int src_step,  int s
  
  #if defined (DOUBLE_SUPPORT)
  __kernel void split_vector_C4_D6 (__global double *mat_src,  int src_step,  int src_offset,
-                                  __global double *mat_dst0, int dst0_step, int dst0_offset,  
-                                  __global double *mat_dst1, int dst1_step, int dst1_offset, 
-                                       __global double *mat_dst2, int dst2_step, int dst2_offset,  
+                                  __global double *mat_dst0, int dst0_step, int dst0_offset,
+                                  __global double *mat_dst1, int dst1_step, int dst1_offset,
+                                    __global double *mat_dst2, int dst2_step, int dst2_offset,
                                    __global double *mat_dst3, int dst3_step, int dst3_offset,
                                    int rows, int cols, int dst_step1)
  
@@ -1154,14 +1154,14 @@ __kernel void split_vector_C4_D6 (__global double *mat_src,  int src_step,  int
      int x = get_global_id(0);
      int y = get_global_id(1);
  
-    if((x  < cols) && (y < rows)) 
+    if((x  < cols) && (y < rows))
      {
-        int src_idx  = mad24(y, src_step,  src_offset); 
+        int src_idx  = mad24(y, src_step,  src_offset);
          int dst0_idx = mad24(y, dst0_step, dst0_offset);
          int dst1_idx = mad24(y, dst1_step, dst1_offset);
          int dst2_idx = mad24(y, dst2_step, dst2_offset);
          int dst3_idx = mad24(y, dst3_step, dst3_offset);
-           
+
          double4 src_data = ((__global double4 *)((__global char *)mat_src + src_idx))[x];
  
          ((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@@ -1172,18 +1172,18 @@ __kernel void split_vector_C4_D6 (__global double *mat_src,  int src_step,  int
  }
  
  __kernel void split_vector_C3_D6 (__global double *mat_src,  int src_step,  int src_offset,
-                                  __global double *mat_dst0, int dst0_step, int dst0_offset,  
-                                  __global double *mat_dst1, int dst1_step, int dst1_offset, 
-                                       __global double *mat_dst2, int dst2_step, int dst2_offset,  
+                                  __global double *mat_dst0, int dst0_step, int dst0_offset,
+                                  __global double *mat_dst1, int dst1_step, int dst1_offset,
+                                    __global double *mat_dst2, int dst2_step, int dst2_offset,
                                    int rows, int cols, int dst_step1)
  
  {
      int x = get_global_id(0);
      int y = get_global_id(1);
  
-    if((x  < cols) && (y < rows)) 
+    if((x  < cols) && (y < rows))
      {
-        int src_idx  = mad24(y, src_step,  src_offset); 
+        int src_idx  = mad24(y, src_step,  src_offset);
          int dst0_idx = mad24(y, dst0_step, dst0_offset);
          int dst1_idx = mad24(y, dst1_step, dst1_offset);
          int dst2_idx = mad24(y, dst2_step, dst2_offset);
@@ -1199,20 +1199,20 @@ __kernel void split_vector_C3_D6 (__global double *mat_src,  int src_step,  int
  }
  
  __kernel void split_vector_C2_D6 (__global double *mat_src,  int src_step,  int src_offset,
-                                  __global double *mat_dst0, int dst0_step, int dst0_offset,  
-                                  __global double *mat_dst1, int dst1_step, int dst1_offset, 
+                                  __global double *mat_dst0, int dst0_step, int dst0_offset,
+                                  __global double *mat_dst1, int dst1_step, int dst1_offset,
                                    int rows, int cols, int dst_step1)
  
  {
      int x = get_global_id(0);
      int y = get_global_id(1);
  
-    if((x  < cols) && (y < rows)) 
+    if((x  < cols) && (y < rows))
      {
-        int src_idx  = mad24(y, src_step,  src_offset); 
+        int src_idx  = mad24(y, src_step,  src_offset);
          int dst0_idx = mad24(y, dst0_step, dst0_offset);
          int dst1_idx = mad24(y, dst1_step, dst1_offset);
-           
+
          double2 src_data = ((__global double2 *)((__global char *)mat_src + src_idx))[x];
  
          ((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
diff --git a/modules/ocl/src/kernels/stereobm.cl b/modules/ocl/src/opencl/stereobm.cl

similarity index 96%

rename from modules/ocl/src/kernels/stereobm.cl

rename to modules/ocl/src/opencl/stereobm.cl

index 4edab86..9542839 100644 (file)
--- a/modules/ocl/src/kernels/stereobm.cl
+++ b/modules/ocl/src/opencl/stereobm.cl
@@ -55,9 +55,9 @@ int SQ(int a)
      return a * a;
  }
  
-unsigned int CalcSSD(volatile __local unsigned int *col_ssd_cache, 
+unsigned int CalcSSD(volatile __local unsigned int *col_ssd_cache,
                       volatile __local unsigned int *col_ssd, int radius)
-{      
+{
      unsigned int cache = 0;
      unsigned int cache2 = 0;
  
@@ -77,7 +77,7 @@ unsigned int CalcSSD(volatile __local unsigned int *col_ssd_cache,
      return col_ssd[0] + cache + cache2;
  }
  
-uint2 MinSSD(volatile __local unsigned int *col_ssd_cache, 
+uint2 MinSSD(volatile __local unsigned int *col_ssd_cache,
               volatile __local unsigned int *col_ssd, int radius)
  {
      unsigned int ssd[N_DISPARITIES];
@@ -112,7 +112,7 @@ uint2 MinSSD(volatile __local unsigned int *col_ssd_cache,
      return (uint2)(mssd, bestIdx);
  }
  
-void StepDown(int idx1, int idx2, __global unsigned char* imageL, 
+void StepDown(int idx1, int idx2, __global unsigned char* imageL,
                __global unsigned char* imageR, int d, volatile  __local unsigned int *col_ssd, int radius)
  {
      unsigned char leftPixel1;
@@ -179,8 +179,8 @@ void StepDown(int idx1, int idx2, __global unsigned char* imageL,
      col_ssd[7 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
  }
  
-void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imageL, 
-                __global unsigned char* imageR, int d, 
+void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imageL,
+                __global unsigned char* imageR, int d,
                  volatile __local unsigned int *col_ssd, int radius)
  {
      unsigned char leftPixel1;
@@ -215,15 +215,15 @@ void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imag
      col_ssd[7 * (BLOCK_W + 2 * radius)] = diffa[7];
  }
  
-__kernel void stereoKernel(__global unsigned char *left, __global unsigned char *right,  
+__kernel void stereoKernel(__global unsigned char *left, __global unsigned char *right,
                             __global unsigned int *cminSSDImage, int cminSSD_step,
                             __global unsigned char *disp, int disp_step,int cwidth, int cheight,
-                           int img_step, int maxdisp, int radius,  
+                           int img_step, int maxdisp, int radius,
                             __local unsigned int *col_ssd_cache)
  {
  
      volatile __local unsigned int *col_ssd = col_ssd_cache + BLOCK_W + get_local_id(0);
-    volatile __local unsigned int *col_ssd_extra = get_local_id(0) < (2 * radius) ? col_ssd + BLOCK_W : 0;  
+    volatile __local unsigned int *col_ssd_extra = get_local_id(0) < (2 * radius) ? col_ssd + BLOCK_W : 0;
  
      int X = get_group_id(0) * BLOCK_W + get_local_id(0) + maxdisp + radius;
     // int Y = get_group_id(1) * ROWSperTHREAD + radius;
@@ -266,8 +266,8 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
              int idx1 = y_tex * img_step + x_tex;
              int idx2 = (y_tex + (2 * radius + 1)) * img_step + x_tex;
  
-            barrier(CLK_GLOBAL_MEM_FENCE); 
-            barrier(CLK_LOCAL_MEM_FENCE); 
+            barrier(CLK_GLOBAL_MEM_FENCE);
+            barrier(CLK_LOCAL_MEM_FENCE);
  
              StepDown(idx1, idx2, left, right, d, col_ssd, radius);
              if (col_ssd_extra > 0)
@@ -276,7 +276,7 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
  
              y_tex += 1;
  
-            barrier(CLK_LOCAL_MEM_FENCE); 
+            barrier(CLK_LOCAL_MEM_FENCE);
  
              if (X < cwidth - radius && row < cheight - radius - Y)
              {
@@ -296,7 +296,7 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char
  //////////////////////////// Sobel Prefiler (signal channel)//////////////////////////////////////
  //////////////////////////////////////////////////////////////////////////////////////////////////
  
-__kernel void prefilter_xsobel(__global unsigned char *input, __global unsigned char *output, 
+__kernel void prefilter_xsobel(__global unsigned char *input, __global unsigned char *output,
                                 int rows, int cols, int prefilterCap)
  {
      int x = get_global_id(0);
@@ -304,7 +304,7 @@ __kernel void prefilter_xsobel(__global unsigned char *input, __global unsigned
  
      if(x < cols && y < rows)
      {
-        int cov = input[(y-1) * cols + (x-1)] * (-1) + input[(y-1) * cols + (x+1)] * (1) + 
+        int cov = input[(y-1) * cols + (x-1)] * (-1) + input[(y-1) * cols + (x+1)] * (1) +
                    input[(y)   * cols + (x-1)] * (-2) + input[(y)   * cols + (x+1)] * (2) +
                    input[(y+1) * cols + (x-1)] * (-1) + input[(y+1) * cols + (x+1)] * (1);
  
@@ -325,10 +325,10 @@ float sobel(__global unsigned char *input, int x, int y, int rows, int cols)
      int x1 = x==0? 0 : x-1;
      if(x < cols && y < rows)
      {
-        conv = (float)input[(y1)  * cols + (x1)] * (-1) + (float)input[(y1)  * cols + (x+1)] * (1) + 
+        conv = (float)input[(y1)  * cols + (x1)] * (-1) + (float)input[(y1)  * cols + (x+1)] * (1) +
                 (float)input[(y)   * cols + (x1)] * (-2) + (float)input[(y)   * cols + (x+1)] * (2) +
                 (float)input[(y+1) * cols + (x1)] * (-1) + (float)input[(y+1) * cols + (x+1)] * (1);
-    
+
      }
      return fabs(conv);
  }
@@ -359,9 +359,9 @@ float CalcSums(__local float *cols, __local float *cols_cache, int winsz)
  }
  
  #define RpT (2 * ROWSperTHREAD)  // got experimentally
-__kernel void textureness_kernel(__global unsigned char *disp, int disp_rows, int disp_cols, 
-                                 int disp_step, __global unsigned char *input, int input_rows, 
-                                 int input_cols,int winsz, float threshold, 
+__kernel void textureness_kernel(__global unsigned char *disp, int disp_rows, int disp_cols,
+                                 int disp_step, __global unsigned char *input, int input_rows,
+                                 int input_cols,int winsz, float threshold,
                                   __local float *cols_cache)
  {
      int winsz2 = winsz/2;
@@ -405,13 +405,13 @@ __kernel void textureness_kernel(__global unsigned char *disp, int disp_rows, in
  
          for(int y = beg_row + 1; y < end_row; ++y)
          {
-            sum = sum - sobel(input, x - winsz2, y - winsz2 - 1, input_rows, input_cols) + 
+            sum = sum - sobel(input, x - winsz2, y - winsz2 - 1, input_rows, input_cols) +
                    sobel(input, x - winsz2, y + winsz2, input_rows, input_cols);
              *cols = sum;
  
              if (cols_extra)
              {
-                sum_extra = sum_extra - sobel(input, x + group_size_x - winsz2, y - winsz2 - 1,input_rows, input_cols) 
+                sum_extra = sum_extra - sobel(input, x + group_size_x - winsz2, y - winsz2 - 1,input_rows, input_cols)
                              + sobel(input, x + group_size_x - winsz2, y + winsz2, input_rows, input_cols);
                  *cols_extra = sum_extra;
              }
author	Andrey Kamaev <andrey.kamaev@itseez.com>
	Fri, 15 Mar 2013 19:56:31 +0000 (23:56 +0400)
committer	Andrey Kamaev <andrey.kamaev@itseez.com>
	Thu, 21 Mar 2013 13:57:01 +0000 (17:57 +0400)
CMakeLists.txt		patch \| blob \| history
cmake/OpenCVModule.cmake		patch \| blob \| history
cmake/cl2cpp.cmake	[moved from modules/ocl/cl2cpp.cmake with 100% similarity]	patch \| blob \| history
modules/ocl/CMakeLists.txt		patch \| blob \| history
modules/ocl/src/kernels/brute_force_match.cl	[deleted file]	patch \| blob \| history
modules/ocl/src/opencl/arithm_2_mat.cl	[moved from modules/ocl/src/kernels/arithm_2_mat.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_LUT.cl	[moved from modules/ocl/src/kernels/arithm_LUT.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_absdiff.cl	[moved from modules/ocl/src/kernels/arithm_absdiff.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_add.cl	[moved from modules/ocl/src/kernels/arithm_add.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_addWeighted.cl	[moved from modules/ocl/src/kernels/arithm_addWeighted.cl with 95% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_add_scalar.cl	[moved from modules/ocl/src/kernels/arithm_add_scalar.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_add_scalar_mask.cl	[moved from modules/ocl/src/kernels/arithm_add_scalar_mask.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_bitwise_and.cl	[moved from modules/ocl/src/kernels/arithm_bitwise_and.cl with 95% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_bitwise_and_mask.cl	[moved from modules/ocl/src/kernels/arithm_bitwise_and_mask.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl	[moved from modules/ocl/src/kernels/arithm_bitwise_and_scalar.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl	[moved from modules/ocl/src/kernels/arithm_bitwise_and_scalar_mask.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_bitwise_not.cl	[moved from modules/ocl/src/kernels/arithm_bitwise_not.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_bitwise_or.cl	[moved from modules/ocl/src/kernels/arithm_bitwise_or.cl with 98% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_bitwise_or_mask.cl	[moved from modules/ocl/src/kernels/arithm_bitwise_or_mask.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl	[moved from modules/ocl/src/kernels/arithm_bitwise_or_scalar.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl	[moved from modules/ocl/src/kernels/arithm_bitwise_or_scalar_mask.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_bitwise_xor.cl	[moved from modules/ocl/src/kernels/arithm_bitwise_xor.cl with 95% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl	[moved from modules/ocl/src/kernels/arithm_bitwise_xor_mask.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl	[moved from modules/ocl/src/kernels/arithm_bitwise_xor_scalar.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl	[moved from modules/ocl/src/kernels/arithm_bitwise_xor_scalar_mask.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_cartToPolar.cl	[moved from modules/ocl/src/kernels/arithm_cartToPolar.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_compare_eq.cl	[moved from modules/ocl/src/kernels/arithm_compare_eq.cl with 74% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_compare_ne.cl	[moved from modules/ocl/src/kernels/arithm_compare_ne.cl with 73% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_div.cl	[moved from modules/ocl/src/kernels/arithm_div.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_exp.cl	[moved from modules/ocl/src/kernels/arithm_exp.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_flip.cl	[moved from modules/ocl/src/kernels/arithm_flip.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_flip_rc.cl	[moved from modules/ocl/src/kernels/arithm_flip_rc.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_log.cl	[moved from modules/ocl/src/kernels/arithm_log.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_magnitude.cl	[moved from modules/ocl/src/kernels/arithm_magnitude.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_magnitudeSqr.cl	[moved from modules/ocl/src/kernels/arithm_magnitudeSqr.cl with 98% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_minMax.cl	[moved from modules/ocl/src/kernels/arithm_minMax.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_minMaxLoc.cl	[moved from modules/ocl/src/kernels/arithm_minMaxLoc.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_minMaxLoc_mask.cl	[moved from modules/ocl/src/kernels/arithm_minMaxLoc_mask.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_minMax_mask.cl	[moved from modules/ocl/src/kernels/arithm_minMax_mask.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_mul.cl	[moved from modules/ocl/src/kernels/arithm_mul.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_nonzero.cl	[moved from modules/ocl/src/kernels/arithm_nonzero.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_phase.cl	[moved from modules/ocl/src/kernels/arithm_phase.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_polarToCart.cl	[moved from modules/ocl/src/kernels/arithm_polarToCart.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_pow.cl	[moved from modules/ocl/src/kernels/arithm_pow.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_sub.cl	[moved from modules/ocl/src/kernels/arithm_sub.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_sub_scalar.cl	[moved from modules/ocl/src/kernels/arithm_sub_scalar.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_sub_scalar_mask.cl	[moved from modules/ocl/src/kernels/arithm_sub_scalar_mask.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_sum.cl	[moved from modules/ocl/src/kernels/arithm_sum.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_sum_3.cl	[moved from modules/ocl/src/kernels/arithm_sum_3.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/arithm_transpose.cl	[moved from modules/ocl/src/kernels/arithm_transpose.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/blend_linear.cl	[moved from modules/ocl/src/kernels/blend_linear.cl with 98% similarity]	patch \| blob \| history
modules/ocl/src/opencl/brute_force_match.cl	[new file with mode: 0644]	patch \| blob
modules/ocl/src/opencl/build_warps.cl	[moved from modules/ocl/src/kernels/build_warps.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/convertC3C4.cl	[moved from modules/ocl/src/kernels/convertC3C4.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/cvt_color.cl	[moved from modules/ocl/src/kernels/cvt_color.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/filter_sep_col.cl	[moved from modules/ocl/src/kernels/filter_sep_col.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/filter_sep_row.cl	[moved from modules/ocl/src/kernels/filter_sep_row.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/filtering_boxFilter.cl	[moved from modules/ocl/src/kernels/filtering_boxFilter.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/filtering_laplacian.cl	[moved from modules/ocl/src/kernels/filtering_laplacian.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/filtering_morph.cl	[moved from modules/ocl/src/kernels/filtering_morph.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/haarobjectdetect.cl	[moved from modules/ocl/src/kernels/haarobjectdetect.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/haarobjectdetect_scaled2.cl	[moved from modules/ocl/src/kernels/haarobjectdetect_scaled2.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/imgproc_bilateral.cl	[moved from modules/ocl/src/kernels/imgproc_bilateral.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/imgproc_calcHarris.cl	[moved from modules/ocl/src/kernels/imgproc_calcHarris.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/imgproc_calcMinEigenVal.cl	[moved from modules/ocl/src/kernels/imgproc_calcMinEigenVal.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/imgproc_canny.cl	[moved from modules/ocl/src/kernels/imgproc_canny.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/imgproc_columnsum.cl	[moved from modules/ocl/src/kernels/imgproc_columnsum.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/imgproc_convolve.cl	[moved from modules/ocl/src/kernels/imgproc_convolve.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/imgproc_copymakeboder.cl	[moved from modules/ocl/src/kernels/imgproc_copymakeboder.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/imgproc_histogram.cl	[moved from modules/ocl/src/kernels/imgproc_histogram.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/imgproc_integral.cl	[moved from modules/ocl/src/kernels/imgproc_integral.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/imgproc_integral_sum.cl	[moved from modules/ocl/src/kernels/imgproc_integral_sum.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/imgproc_median.cl	[moved from modules/ocl/src/kernels/imgproc_median.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/imgproc_remap.cl	[moved from modules/ocl/src/kernels/imgproc_remap.cl with 98% similarity]	patch \| blob \| history
modules/ocl/src/opencl/imgproc_resize.cl	[moved from modules/ocl/src/kernels/imgproc_resize.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/imgproc_threshold.cl	[moved from modules/ocl/src/kernels/imgproc_threshold.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/imgproc_warpAffine.cl	[moved from modules/ocl/src/kernels/imgproc_warpAffine.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/imgproc_warpPerspective.cl	[moved from modules/ocl/src/kernels/imgproc_warpPerspective.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/interpolate_frames.cl	[moved from modules/ocl/src/kernels/interpolate_frames.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/match_template.cl	[moved from modules/ocl/src/kernels/match_template.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/meanShift.cl	[moved from modules/ocl/src/kernels/meanShift.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/merge_mat.cl	[moved from modules/ocl/src/kernels/merge_mat.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/moments.cl	[moved from modules/ocl/src/kernels/moments.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/nonfree_surf.cl	[moved from modules/ocl/src/kernels/nonfree_surf.cl with 94% similarity]	patch \| blob \| history
modules/ocl/src/opencl/objdetect_hog.cl	[moved from modules/ocl/src/kernels/objdetect_hog.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/operator_convertTo.cl	[moved from modules/ocl/src/kernels/operator_convertTo.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/operator_copyToM.cl	[moved from modules/ocl/src/kernels/operator_copyToM.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/operator_setTo.cl	[moved from modules/ocl/src/kernels/operator_setTo.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/operator_setToM.cl	[moved from modules/ocl/src/kernels/operator_setToM.cl with 99% similarity]	patch \| blob \| history
modules/ocl/src/opencl/pyr_down.cl	[moved from modules/ocl/src/kernels/pyr_down.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/pyr_up.cl	[moved from modules/ocl/src/kernels/pyr_up.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/pyrlk.cl	[moved from modules/ocl/src/kernels/pyrlk.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/pyrlk_no_image.cl	[moved from modules/ocl/src/kernels/pyrlk_no_image.cl with 100% similarity]	patch \| blob \| history
modules/ocl/src/opencl/split_mat.cl	[moved from modules/ocl/src/kernels/split_mat.cl with 87% similarity]	patch \| blob \| history
modules/ocl/src/opencl/stereobm.cl	[moved from modules/ocl/src/kernels/stereobm.cl with 96% similarity]	patch \| blob \| history