status(" Use fast math:" CUDA_FAST_MATH THEN YES ELSE NO)
endif()
-if(HAVE_OPENCL AND BUILD_opencv_ocl)
+if(HAVE_OPENCL)
status("")
status(" OpenCL")
if(OPENCL_INCLUDE_DIR)
file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
file(GLOB lib_hdrs_detail "include/opencv2/${name}/detail/*.hpp" "include/opencv2/${name}/detail/*.h")
+ file(GLOB cl_kernels "src/opencl/*.cl")
+
source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs})
source_group("Include" FILES ${lib_hdrs})
source_group("Include\\detail" FILES ${lib_hdrs_detail})
+ if(HAVE_OPENCL AND cl_kernels)
+ ocv_include_directories(${OPENCL_INCLUDE_DIRS})
+ add_custom_command(
+ OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp"
+ COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake"
+ DEPENDS ${cl_kernels} "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake")
+ source_group("Src\\OpenCL" FILES ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp")
+ list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp")
+ endif()
+
ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail} SOURCES ${lib_srcs} ${lib_int_hdrs})
endmacro()
if(NOT "${ARGN}" STREQUAL "SKIP_LINK")
target_link_libraries(${the_module} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_MODULE_${the_module}_DEPS_EXT} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${ARGN})
+ if(HAVE_OPENCL AND OPENCL_LIBRARIES)
+ target_link_libraries(${the_module} ${OPENCL_LIBRARIES})
+ endif()
endif()
add_dependencies(opencv_modules ${the_module})
endif()
set(the_description "OpenCL-accelerated Computer Vision")
-ocv_add_module(ocl opencv_core opencv_imgproc opencv_features2d opencv_objdetect opencv_video opencv_nonfree)
-ocv_module_include_directories(${OPENCL_INCLUDE_DIRS})
-
-file(GLOB CL_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/kernels/*.cl")
-set(kernels_cpp "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp")
-set(cl2cpp_script "${CMAKE_CURRENT_SOURCE_DIR}/cl2cpp.cmake")
-
-add_custom_command(
- OUTPUT ${kernels_cpp}
- COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/kernels" -DOUTPUT="${kernels_cpp}" -P ${cl2cpp_script}
- DEPENDS ${CL_FILES} ${cl2cpp_script})
-
-file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
-file(GLOB lib_srcs "src/*.cpp")
-file(GLOB lib_int_hdrs "src/*.h*")
-
-source_group("Include" FILES ${lib_hdrs})
-source_group("Src\\Host" FILES ${lib_srcs} ${lib_int_hdrs} ${kernels_cpp})
-
+ocv_define_module(ocl opencv_core opencv_imgproc opencv_features2d opencv_objdetect opencv_video opencv_nonfree)
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow)
-
-ocv_set_module_sources(HEADERS ${lib_hdrs} SOURCES ${lib_int_hdrs} ${lib_srcs} ${kernels_cpp})
-ocv_create_module(${OPENCL_LIBRARIES})
-ocv_add_precompiled_headers(${the_module})
-
-################################################################################################################
-################################ OpenCL Module Tests ##################################################
-################################################################################################################
-file(GLOB test_srcs "test/*.cpp")
-file(GLOB test_hdrs "test/*.hpp" "test/*.h")
-
-ocv_add_accuracy_tests(FILES "Include" ${test_hdrs}
- FILES "Src" ${test_srcs})
-
-################################################################################################################
-################################ OpenCL Module Performance ##################################################
-################################################################################################################
-file(GLOB perf_srcs "perf/*.cpp")
-file(GLOB perf_hdrs "perf/*.hpp" "perf/*.h")
-
-ocv_add_perf_tests(FILES "Include" ${perf_hdrs}
- FILES "Src" ${perf_srcs})
+++ /dev/null
-#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
-#define MAX_FLOAT 1e7f
-
-int bit1Count(float x)
-{
- int c = 0;
- int ix = (int)x;
-
- for (int i = 0 ; i < 32 ; i++)
- {
- c += ix & 0x1;
- ix >>= 1;
- }
-
- return (float)c;
-}
-/* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size
-local size: dim0 is block_size, dim1 is block_size.
-*/
-__kernel void BruteForceMatch_UnrollMatch(
- __global float *query,
- __global float *train,
- //__global float *mask,
- __global int *bestTrainIdx,
- __global float *bestDistance,
- __local float *sharebuffer,
- int block_size,
- int max_desc_len,
- int query_rows,
- int query_cols,
- int train_rows,
- int train_cols,
- int step,
- int distType
-)
-{
- const int lidx = get_local_id(0);
- const int lidy = get_local_id(1);
- const int groupidx = get_group_id(0);
-
- __local float *s_query = sharebuffer;
- __local float *s_train = sharebuffer + block_size * max_desc_len;
-
- int queryIdx = groupidx * block_size + lidy;
-
- // load the query into local memory.
- for (int i = 0 ; i < max_desc_len / block_size; i ++)
- {
- int loadx = lidx + i * block_size;
- s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
- }
-
- float myBestDistance = MAX_FLOAT;
- int myBestTrainIdx = -1;
-
- // loopUnrolledCached to find the best trainIdx and best distance.
- volatile int imgIdx = 0;
-
- for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
- {
- float result = 0;
-
- for (int i = 0 ; i < max_desc_len / block_size ; i++)
- {
- //load a block_size * block_size block into local train.
- const int loadx = lidx + i * block_size;
- s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
-
- //synchronize to make sure each elem for reduceIteration in share memory is written already.
- barrier(CLK_LOCAL_MEM_FENCE);
-
- /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
- sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-
- switch (distType)
- {
- case 0:
-
- for (int j = 0 ; j < block_size ; j++)
- {
- result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
- }
-
- break;
- case 1:
-
- for (int j = 0 ; j < block_size ; j++)
- {
- float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
- result += qr * qr;
- }
-
- break;
- case 2:
-
- for (int j = 0 ; j < block_size ; j++)
- {
- //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
- result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
- }
-
- break;
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-
- int trainIdx = t * block_size + lidx;
-
- if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
- {
- //bestImgIdx = imgIdx;
- myBestDistance = result;
- myBestTrainIdx = trainIdx;
- }
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
- __local float *s_distance = (__local float *)(sharebuffer);
- __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
-
- //find BestMatch
- s_distance += lidy * block_size;
- s_trainIdx += lidy * block_size;
- s_distance[lidx] = myBestDistance;
- s_trainIdx[lidx] = myBestTrainIdx;
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- //reduce -- now all reduce implement in each threads.
- for (int k = 0 ; k < block_size; k++)
- {
- if (myBestDistance > s_distance[k])
- {
- myBestDistance = s_distance[k];
- myBestTrainIdx = s_trainIdx[k];
- }
- }
-
- if (queryIdx < query_rows && lidx == 0)
- {
- bestTrainIdx[queryIdx] = myBestTrainIdx;
- bestDistance[queryIdx] = myBestDistance;
- }
-}
-
-__kernel void BruteForceMatch_Match(
- __global float *query,
- __global float *train,
- //__global float *mask,
- __global int *bestTrainIdx,
- __global float *bestDistance,
- __local float *sharebuffer,
- int block_size,
- int query_rows,
- int query_cols,
- int train_rows,
- int train_cols,
- int step,
- int distType
-)
-{
- const int lidx = get_local_id(0);
- const int lidy = get_local_id(1);
- const int groupidx = get_group_id(0);
-
- const int queryIdx = groupidx * block_size + lidy;
-
- float myBestDistance = MAX_FLOAT;
- int myBestTrainIdx = -1;
-
- __local float *s_query = sharebuffer;
- __local float *s_train = sharebuffer + block_size * block_size;
-
- // loop
- for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
- {
- //Dist dist;
- float result = 0;
-
- for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
- {
- const int loadx = lidx + i * block_size;
- //load query and train into local memory
- s_query[lidy * block_size + lidx] = 0;
- s_train[lidx * block_size + lidy] = 0;
-
- if (loadx < query_cols)
- {
- s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
- s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
- sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-
- switch (distType)
- {
- case 0:
-
- for (int j = 0 ; j < block_size ; j++)
- {
- result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
- }
-
- break;
- case 1:
-
- for (int j = 0 ; j < block_size ; j++)
- {
- float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
- result += qr * qr;
- }
-
- break;
- case 2:
-
- for (int j = 0 ; j < block_size ; j++)
- {
- //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
- result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
- }
-
- break;
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-
- const int trainIdx = t * block_size + lidx;
-
- if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
- {
- //myBestImgidx = imgIdx;
- myBestDistance = result;
- myBestTrainIdx = trainIdx;
- }
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- __local float *s_distance = (__local float *)sharebuffer;
- __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
-
- //findBestMatch
- s_distance += lidy * block_size;
- s_trainIdx += lidy * block_size;
- s_distance[lidx] = myBestDistance;
- s_trainIdx[lidx] = myBestTrainIdx;
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- //reduce -- now all reduce implement in each threads.
- for (int k = 0 ; k < block_size; k++)
- {
- if (myBestDistance > s_distance[k])
- {
- myBestDistance = s_distance[k];
- myBestTrainIdx = s_trainIdx[k];
- }
- }
-
- if (queryIdx < query_rows && lidx == 0)
- {
- bestTrainIdx[queryIdx] = myBestTrainIdx;
- bestDistance[queryIdx] = myBestDistance;
- }
-}
-
-//radius_unrollmatch
-__kernel void BruteForceMatch_RadiusUnrollMatch(
- __global float *query,
- __global float *train,
- float maxDistance,
- //__global float *mask,
- __global int *bestTrainIdx,
- __global float *bestDistance,
- __global int *nMatches,
- __local float *sharebuffer,
- int block_size,
- int max_desc_len,
- int query_rows,
- int query_cols,
- int train_rows,
- int train_cols,
- int bestTrainIdx_cols,
- int step,
- int ostep,
- int distType
-)
-{
- const int lidx = get_local_id(0);
- const int lidy = get_local_id(1);
- const int groupidx = get_group_id(0);
- const int groupidy = get_group_id(1);
-
- const int queryIdx = groupidy * block_size + lidy;
- const int trainIdx = groupidx * block_size + lidx;
-
- __local float *s_query = sharebuffer;
- __local float *s_train = sharebuffer + block_size * block_size;
-
- float result = 0;
-
- for (int i = 0 ; i < max_desc_len / block_size ; ++i)
- {
- //load a block_size * block_size block into local train.
- const int loadx = lidx + i * block_size;
-
- s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
- s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
-
- //synchronize to make sure each elem for reduceIteration in share memory is written already.
- barrier(CLK_LOCAL_MEM_FENCE);
-
- /* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
- sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-
- switch (distType)
- {
- case 0:
-
- for (int j = 0 ; j < block_size ; ++j)
- {
- result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
- }
-
- break;
- case 1:
-
- for (int j = 0 ; j < block_size ; ++j)
- {
- float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
- result += qr * qr;
- }
-
- break;
- case 2:
-
- for (int j = 0 ; j < block_size ; ++j)
- {
- result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
- }
-
- break;
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-
- if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
- {
- unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
-
- if (ind < bestTrainIdx_cols)
- {
- //bestImgIdx = imgIdx;
- bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
- bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
- }
- }
-}
-
-//radius_match
-__kernel void BruteForceMatch_RadiusMatch(
- __global float *query,
- __global float *train,
- float maxDistance,
- //__global float *mask,
- __global int *bestTrainIdx,
- __global float *bestDistance,
- __global int *nMatches,
- __local float *sharebuffer,
- int block_size,
- int query_rows,
- int query_cols,
- int train_rows,
- int train_cols,
- int bestTrainIdx_cols,
- int step,
- int ostep,
- int distType
-)
-{
- const int lidx = get_local_id(0);
- const int lidy = get_local_id(1);
- const int groupidx = get_group_id(0);
- const int groupidy = get_group_id(1);
-
- const int queryIdx = groupidy * block_size + lidy;
- const int trainIdx = groupidx * block_size + lidx;
-
- __local float *s_query = sharebuffer;
- __local float *s_train = sharebuffer + block_size * block_size;
-
- float result = 0;
-
- for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i)
- {
- //load a block_size * block_size block into local train.
- const int loadx = lidx + i * block_size;
-
- s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
- s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
-
- //synchronize to make sure each elem for reduceIteration in share memory is written already.
- barrier(CLK_LOCAL_MEM_FENCE);
-
- /* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
- sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-
- switch (distType)
- {
- case 0:
-
- for (int j = 0 ; j < block_size ; ++j)
- {
- result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
- }
-
- break;
- case 1:
-
- for (int j = 0 ; j < block_size ; ++j)
- {
- float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
- result += qr * qr;
- }
-
- break;
- case 2:
-
- for (int j = 0 ; j < block_size ; ++j)
- {
- result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
- }
-
- break;
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-
- if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
- {
- unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
-
- if (ind < bestTrainIdx_cols)
- {
- //bestImgIdx = imgIdx;
- bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
- bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
- }
- }
-}
-
-
-__kernel void BruteForceMatch_knnUnrollMatch(
- __global float *query,
- __global float *train,
- //__global float *mask,
- __global int2 *bestTrainIdx,
- __global float2 *bestDistance,
- __local float *sharebuffer,
- int block_size,
- int max_desc_len,
- int query_rows,
- int query_cols,
- int train_rows,
- int train_cols,
- int step,
- int distType
-)
-{
- const int lidx = get_local_id(0);
- const int lidy = get_local_id(1);
- const int groupidx = get_group_id(0);
-
- const int queryIdx = groupidx * block_size + lidy;
- local float *s_query = sharebuffer;
- local float *s_train = sharebuffer + block_size * max_desc_len;
-
- // load the query into local memory.
- for (int i = 0 ; i < max_desc_len / block_size; i ++)
- {
- int loadx = lidx + i * block_size;
- s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
- }
-
- float myBestDistance1 = MAX_FLOAT;
- float myBestDistance2 = MAX_FLOAT;
- int myBestTrainIdx1 = -1;
- int myBestTrainIdx2 = -1;
-
- //loopUnrolledCached
- volatile int imgIdx = 0;
-
- for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
- {
- float result = 0;
-
- for (int i = 0 ; i < max_desc_len / block_size ; i++)
- {
- const int loadX = lidx + i * block_size;
- //load a block_size * block_size block into local train.
- const int loadx = lidx + i * block_size;
- s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
-
- //synchronize to make sure each elem for reduceIteration in share memory is written already.
- barrier(CLK_LOCAL_MEM_FENCE);
-
- /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
- sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-
- switch (distType)
- {
- case 0:
-
- for (int j = 0 ; j < block_size ; j++)
- {
- result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
- }
-
- break;
- case 1:
-
- for (int j = 0 ; j < block_size ; j++)
- {
- float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
- result += qr * qr;
- }
-
- break;
- case 2:
-
- for (int j = 0 ; j < block_size ; j++)
- {
- //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
- result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
- }
-
- break;
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-
- const int trainIdx = t * block_size + lidx;
-
- if (queryIdx < query_rows && trainIdx < train_rows)
- {
- if (result < myBestDistance1)
- {
- myBestDistance2 = myBestDistance1;
- myBestTrainIdx2 = myBestTrainIdx1;
- myBestDistance1 = result;
- myBestTrainIdx1 = trainIdx;
- }
- else if (result < myBestDistance2)
- {
- myBestDistance2 = result;
- myBestTrainIdx2 = trainIdx;
- }
- }
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- local float *s_distance = (local float *)sharebuffer;
- local int *s_trainIdx = (local int *)(sharebuffer + block_size * block_size);
-
- // find BestMatch
- s_distance += lidy * block_size;
- s_trainIdx += lidy * block_size;
-
- s_distance[lidx] = myBestDistance1;
- s_trainIdx[lidx] = myBestTrainIdx1;
-
- float bestDistance1 = MAX_FLOAT;
- float bestDistance2 = MAX_FLOAT;
- int bestTrainIdx1 = -1;
- int bestTrainIdx2 = -1;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (lidx == 0)
- {
- for (int i = 0 ; i < block_size ; i++)
- {
- float val = s_distance[i];
-
- if (val < bestDistance1)
- {
- bestDistance2 = bestDistance1;
- bestTrainIdx2 = bestTrainIdx1;
-
- bestDistance1 = val;
- bestTrainIdx1 = s_trainIdx[i];
- }
- else if (val < bestDistance2)
- {
- bestDistance2 = val;
- bestTrainIdx2 = s_trainIdx[i];
- }
- }
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- s_distance[lidx] = myBestDistance2;
- s_trainIdx[lidx] = myBestTrainIdx2;
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (lidx == 0)
- {
- for (int i = 0 ; i < block_size ; i++)
- {
- float val = s_distance[i];
-
- if (val < bestDistance2)
- {
- bestDistance2 = val;
- bestTrainIdx2 = s_trainIdx[i];
- }
- }
- }
-
- myBestDistance1 = bestDistance1;
- myBestDistance2 = bestDistance2;
-
- myBestTrainIdx1 = bestTrainIdx1;
- myBestTrainIdx2 = bestTrainIdx2;
-
- if (queryIdx < query_rows && lidx == 0)
- {
- bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
- bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
- }
-}
-
-__kernel void BruteForceMatch_knnMatch(
- __global float *query,
- __global float *train,
- //__global float *mask,
- __global int2 *bestTrainIdx,
- __global float2 *bestDistance,
- __local float *sharebuffer,
- int block_size,
- int query_rows,
- int query_cols,
- int train_rows,
- int train_cols,
- int step,
- int distType
-)
-{
- const int lidx = get_local_id(0);
- const int lidy = get_local_id(1);
- const int groupidx = get_group_id(0);
-
- const int queryIdx = groupidx * block_size + lidy;
- local float *s_query = sharebuffer;
- local float *s_train = sharebuffer + block_size * block_size;
-
- float myBestDistance1 = MAX_FLOAT;
- float myBestDistance2 = MAX_FLOAT;
- int myBestTrainIdx1 = -1;
- int myBestTrainIdx2 = -1;
-
- //loop
- for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
- {
- float result = 0.0f;
-
- for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
- {
- const int loadx = lidx + i * block_size;
- //load query and train into local memory
- s_query[lidy * block_size + lidx] = 0;
- s_train[lidx * block_size + lidy] = 0;
-
- if (loadx < query_cols)
- {
- s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
- s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
- sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
-
- switch (distType)
- {
- case 0:
-
- for (int j = 0 ; j < block_size ; j++)
- {
- result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
- }
-
- break;
- case 1:
-
- for (int j = 0 ; j < block_size ; j++)
- {
- float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
- result += qr * qr;
- }
-
- break;
- case 2:
-
- for (int j = 0 ; j < block_size ; j++)
- {
- //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
- result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
- }
-
- break;
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
- }
-
- const int trainIdx = t * block_size + lidx;
-
- if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
- {
- if (result < myBestDistance1)
- {
- myBestDistance2 = myBestDistance1;
- myBestTrainIdx2 = myBestTrainIdx1;
- myBestDistance1 = result;
- myBestTrainIdx1 = trainIdx;
- }
- else if (result < myBestDistance2)
- {
- myBestDistance2 = result;
- myBestTrainIdx2 = trainIdx;
- }
- }
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- __local float *s_distance = (__local float *)sharebuffer;
- __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
-
- //findBestMatch
- s_distance += lidy * block_size;
- s_trainIdx += lidy * block_size;
-
- s_distance[lidx] = myBestDistance1;
- s_trainIdx[lidx] = myBestTrainIdx1;
-
- float bestDistance1 = MAX_FLOAT;
- float bestDistance2 = MAX_FLOAT;
- int bestTrainIdx1 = -1;
- int bestTrainIdx2 = -1;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (lidx == 0)
- {
- for (int i = 0 ; i < block_size ; i++)
- {
- float val = s_distance[i];
-
- if (val < bestDistance1)
- {
- bestDistance2 = bestDistance1;
- bestTrainIdx2 = bestTrainIdx1;
-
- bestDistance1 = val;
- bestTrainIdx1 = s_trainIdx[i];
- }
- else if (val < bestDistance2)
- {
- bestDistance2 = val;
- bestTrainIdx2 = s_trainIdx[i];
- }
- }
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- s_distance[lidx] = myBestDistance2;
- s_trainIdx[lidx] = myBestTrainIdx2;
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (lidx == 0)
- {
- for (int i = 0 ; i < block_size ; i++)
- {
- float val = s_distance[i];
-
- if (val < bestDistance2)
- {
- bestDistance2 = val;
- bestTrainIdx2 = s_trainIdx[i];
- }
- }
- }
-
- myBestDistance1 = bestDistance1;
- myBestDistance2 = bestDistance2;
-
- myBestTrainIdx1 = bestTrainIdx1;
- myBestTrainIdx2 = bestTrainIdx2;
-
- if (queryIdx < query_rows && lidx == 0)
- {
- bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
- bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
- }
-}
-
-kernel void BruteForceMatch_calcDistanceUnrolled(
- __global float *query,
- __global float *train,
- //__global float *mask,
- __global float *allDist,
- __local float *sharebuffer,
- int block_size,
- int max_desc_len,
- int query_rows,
- int query_cols,
- int train_rows,
- int train_cols,
- int step,
- int distType)
-{
- /* Todo */
-}
-
-kernel void BruteForceMatch_calcDistance(
- __global float *query,
- __global float *train,
- //__global float *mask,
- __global float *allDist,
- __local float *sharebuffer,
- int block_size,
- int query_rows,
- int query_cols,
- int train_rows,
- int train_cols,
- int step,
- int distType)
-{
- /* Todo */
-}
-
-kernel void BruteForceMatch_findBestMatch(
- __global float *allDist,
- __global int *bestTrainIdx,
- __global float *bestDistance,
- int k,
- int block_size
-)
-{
- /* Todo */
-}
\ No newline at end of file
int y = get_global_id(1);
if (x < cols && y < rows)
-
+
{
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- uchar4 src1_data ,src2_data;
+ uchar4 src1_data ,src2_data;
- src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
- src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
- src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
- src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
+ src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
+ src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
+ src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
+ src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
- src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
- src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
- src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
- src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
+ src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
+ src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
+ src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
+ src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
// short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama;
int y = get_global_id(1);
if (x < cols && y < rows)
-
+
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int y = get_global_id(1);
if (x < cols && y < rows)
-
+
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int y = get_global_id(1);
if (x < cols && y < rows)
-
+
{
-
+
x = x << 2;
#define bitOfInt (sizeof(int)== 4 ? 2: 3)
#define dst_align ((dst_offset >> bitOfInt) & 3)
- int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
- int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
-
+ int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
+ int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
+
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt));
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix));
-
+
if(src1_index < 0)
{
int4 tmp;
int y = get_global_id(1);
if (x < cols && y < rows)
-
+
{
-
+
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
-
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
int y = get_global_id(1);
if (x < cols && y < rows)
-
+
{
-
+
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
- int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
- int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
-
+ int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
+ int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
+
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3));
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
- {
+ {
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- uchar4 tmp;
+ }
+ if(src2_index < 0)
+ {
+ uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
char4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
- {
+ {
char4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- char4 tmp;
+ }
+ if(src2_index < 0)
+ {
+ char4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
- {
+ {
ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- ushort4 tmp;
+ }
+ if(src2_index < 0)
+ {
+ ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
- {
+ {
short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- short4 tmp;
+ }
+ if(src2_index < 0)
+ {
+ short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
}
}
#endif
-
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = ~ src1_data;
-
+
/* if(src1_index < 0)
{
uchar4 tmp;
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
{
int src_index = mad24(y, src_step, (x << 3) + src_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
-
+
char8 data;
data = *((__global char8 *)((__global char *)src + src_index));
data = ~ data;
-
+
*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
}
#endif
-
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
}
}
#endif
-
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
- {
+ {
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- uchar4 tmp;
+ }
+ if(src2_index < 0)
+ {
+ uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
char4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
- {
+ {
char4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- char4 tmp;
+ }
+ if(src2_index < 0)
+ {
+ char4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
- {
+ {
ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- ushort4 tmp;
+ }
+ if(src2_index < 0)
+ {
+ ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
if(src1_index < 0)
- {
+ {
short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- short4 tmp;
+ }
+ if(src2_index < 0)
+ {
+ short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
}
}
#endif
-
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
- if(src1_index < 0)
- {
- uchar4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- uchar4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
-
-
+ if(src1_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
+
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
x = x << 2;
#define dst_align ((dst_offset >> 1)& 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
- ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
- if(src1_index < 0)
- {
- ushort4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- ushort4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
+ ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+ if(src1_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
- short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
- if(src1_index < 0)
- {
- short4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- short4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
-
-
-
+ short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+ if(src1_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
+
+
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
int y = get_global_id(1);
if (x < cols && y < rows)
- {
+ {
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
- if(src1_index < 0)
- {
- int4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- int4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
+ if(src1_index < 0)
+ {
+ int4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ int4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
- float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src2_index < 0)
- {
- float4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
+ float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src2_index < 0)
+ {
+ float4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
- int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
- int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
+ int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
+ int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
- if(src1_index < 0)
- {
- double4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- double4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
+ if(src1_index < 0)
+ {
+ double4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ double4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
- if(src1_index < 0)
- {
- uchar4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- uchar4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
-
-
+ if(src1_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
+
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
- ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
- if(src1_index < 0)
- {
- ushort4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- ushort4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
-
-
+ ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+ if(src1_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
+
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
- short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
- if(src1_index < 0)
- {
- short4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- short4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
+ short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+ if(src1_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
- if(src1_index < 0)
- {
- int4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- int4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
-
+ if(src1_index < 0)
+ {
+ int4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ int4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
- if(src1_index < 0)
- {
- float4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- float4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
+ if(src1_index < 0)
+ {
+ float4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ float4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
- int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
- int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
+ int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
+ int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
- double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
- if(src1_index < 0)
- {
- double4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- double4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
+ double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
+ if(src1_index < 0)
+ {
+ double4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ double4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
- if(src1_index < 0)
- {
- uchar4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- uchar4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
+ if(src1_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
- ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
- if(src1_index < 0)
- {
- ushort4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- ushort4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
+ ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+ if(src1_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
x = x << 2;
#define dst_align ((dst_offset >> 1)& 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
- short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
- if(src1_index < 0)
- {
- short4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- short4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
+ short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+ if(src1_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
- if(src1_index < 0)
- {
- int4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- int4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
+ if(src1_index < 0)
+ {
+ int4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ int4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
- if(src1_index < 0)
- {
-
- float4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- float4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
+ if(src1_index < 0)
+ {
+
+ float4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ float4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
x = x << 2;
#define dst_align ((dst_offset >> 3)& 3)
- int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
- int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
+ int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
+ int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
- double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
- if(src1_index < 0)
- {
- double4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- double4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- } uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
+ double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
+ if(src1_index < 0)
+ {
+ double4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ double4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ } uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
}
}
#endif
-
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
- if(src1_index < 0)
- {
- uchar4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- uchar4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
+ if(src1_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
x = x << 2;
#define dst_align ((dst_offset >> 1)& 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
- ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
- if(src1_index < 0)
- {
- ushort4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- ushort4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
+ ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+ if(src1_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
x = x << 2;
#define dst_align ((dst_offset >> 1)& 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
- short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
- if(src1_index < 0)
- {
- short4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- short4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
+ short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+ if(src1_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
{
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
- if(src1_index < 0)
- {
- int4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- int4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
+ if(src1_index < 0)
+ {
+ int4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ int4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
- float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src1_index < 0)
- {
- float4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- float4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
+ float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src1_index < 0)
+ {
+ float4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ float4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
- int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
- int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
+ int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
+ int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
- double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
- if(src1_index < 0)
- {
- double4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- double4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
+ double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
+ if(src1_index < 0)
+ {
+ double4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ double4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
}
#endif
-
+
/***********************************Compare LT*******************************/
__kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
- if(src1_index < 0)
- {
- uchar4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- uchar4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
+ if(src1_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
- ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
- if(src1_index < 0)
- {
- ushort4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- ushort4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
+ ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+ if(src1_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
- short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
- if(src1_index < 0)
- {
- short4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- short4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
+ short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+ if(src1_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
- if(src1_index < 0)
- {
- int4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- int4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
-
-
-
+ if(src1_index < 0)
+ {
+ int4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ int4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
+
+
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
- if(src1_index < 0)
- {
- float4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- float4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
+ if(src1_index < 0)
+ {
+ float4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ float4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
- int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
- int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
+ int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
+ int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
- double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
- if(src1_index < 0)
- {
- double4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- double4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
+ double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
+ if(src1_index < 0)
+ {
+ double4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ double4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
x = x << 2;
#define dst_align (dst_offset & 3)
- int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
- int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+ int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+ int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
- if(src1_index < 0)
- {
- uchar4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- uchar4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
+ if(src1_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ uchar4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
- ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
- if(src1_index < 0)
- {
- ushort4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- ushort4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
+ ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
+ if(src1_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
- int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
- int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
+ int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
+ int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
- short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
- if(src1_index < 0)
- {
- short4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- short4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
+ short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
+ if(src1_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
{
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
- if(src1_index < 0)
- {
- int4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- int4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
+ if(src1_index < 0)
+ {
+ int4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ int4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data =convert_uchar4((src1_data <= src2_data));
{
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
- float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
- if(src1_index < 0)
- {
- float4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- float4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
+ float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
+ if(src1_index < 0)
+ {
+ float4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ float4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
{
x = x << 2;
#define dst_align ((dst_offset >> 3)& 3)
- int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
- int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
+ int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
+ int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
- int src1_index_fix = src1_index < 0 ? 0 : src1_index;
- int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+ int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+ int src2_index_fix = src2_index < 0 ? 0 : src2_index;
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
- double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
- if(src1_index < 0)
- {
- double4 tmp;
- tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
- src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
- }
- if(src2_index < 0)
- {
- double4 tmp;
- tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
- src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
- }
-
+ double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
+ if(src1_index < 0)
+ {
+ double4 tmp;
+ tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+ src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+ }
+ if(src2_index < 0)
+ {
+ double4 tmp;
+ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+ src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+ }
+
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
}
}
#endif
-
-
int y = get_global_id(1);
if (x < cols && y < rows)
-
+
{
-
+
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
- int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
- int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
-
+ int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
+ int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
+
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
int y = get_global_id(1);
if (x < cols && y < rows)
-
+
{
-
+
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
- int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
-
+ int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
+
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
src1_data.s01234567 = src1_data.s45670123;
if(src1_index== -2)
src1_data.s01234567 = src1_data.s23456701;
-
-
+
+
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(lm_maxloc[0]);
}
}
-
dst[gid + groupnum] = localmem_max[0];
}
}
-
dst[gid] = localmem_sum[0];
}
}
-
dst[gid*3+2] = localmem_sum3[0];
}
}
-
// Third party copyrights are property of their respective owners.
//
// @Authors
-// Liu Liujun, liujun@multicorewareinc.com
+// Liu Liujun, liujun@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
int pos = mad24(idy,istep >> 2,idx);
int wpos = mad24(idy,wstep >> 2,idx);
float4 w1 = weight1[wpos], w2 = weight2[wpos];
- dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 +
+ dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 +
convert_float4(img2[pos]) * w2) / (w1 + w2 + 1e-5f));
}
}
int wpos = mad24(idy,wstep, idx);
float w1 = weight1[wpos];
float w2 = weight2[wpos];
- dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 +
+ dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 +
convert_float4(img2[pos]) * w2) / (w1 + w2 + 1e-5f));
}
}
dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f);
}
}
-
--- /dev/null
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
+#define MAX_FLOAT 1e7f
+
+int bit1Count(float x)
+{
+ int c = 0;
+ int ix = (int)x;
+
+ for (int i = 0 ; i < 32 ; i++)
+ {
+ c += ix & 0x1;
+ ix >>= 1;
+ }
+
+ return (float)c;
+}
+/* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size
+local size: dim0 is block_size, dim1 is block_size.
+*/
+__kernel void BruteForceMatch_UnrollMatch(
+ __global float *query,
+ __global float *train,
+ //__global float *mask,
+ __global int *bestTrainIdx,
+ __global float *bestDistance,
+ __local float *sharebuffer,
+ int block_size,
+ int max_desc_len,
+ int query_rows,
+ int query_cols,
+ int train_rows,
+ int train_cols,
+ int step,
+ int distType
+)
+{
+ const int lidx = get_local_id(0);
+ const int lidy = get_local_id(1);
+ const int groupidx = get_group_id(0);
+
+ __local float *s_query = sharebuffer;
+ __local float *s_train = sharebuffer + block_size * max_desc_len;
+
+ int queryIdx = groupidx * block_size + lidy;
+
+ // load the query into local memory.
+ for (int i = 0 ; i < max_desc_len / block_size; i ++)
+ {
+ int loadx = lidx + i * block_size;
+ s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+ }
+
+ float myBestDistance = MAX_FLOAT;
+ int myBestTrainIdx = -1;
+
+ // loopUnrolledCached to find the best trainIdx and best distance.
+ volatile int imgIdx = 0;
+
+ for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
+ {
+ float result = 0;
+
+ for (int i = 0 ; i < max_desc_len / block_size ; i++)
+ {
+ //load a block_size * block_size block into local train.
+ const int loadx = lidx + i * block_size;
+ s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+
+ //synchronize to make sure each elem for reduceIteration in share memory is written already.
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
+ sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
+
+ switch (distType)
+ {
+ case 0:
+
+ for (int j = 0 ; j < block_size ; j++)
+ {
+ result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
+ }
+
+ break;
+ case 1:
+
+ for (int j = 0 ; j < block_size ; j++)
+ {
+ float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
+ result += qr * qr;
+ }
+
+ break;
+ case 2:
+
+ for (int j = 0 ; j < block_size ; j++)
+ {
+ //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
+ result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
+ }
+
+ break;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ int trainIdx = t * block_size + lidx;
+
+ if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/)
+ {
+ //bestImgIdx = imgIdx;
+ myBestDistance = result;
+ myBestTrainIdx = trainIdx;
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ __local float *s_distance = (__local float *)(sharebuffer);
+ __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
+
+ //find BestMatch
+ s_distance += lidy * block_size;
+ s_trainIdx += lidy * block_size;
+ s_distance[lidx] = myBestDistance;
+ s_trainIdx[lidx] = myBestTrainIdx;
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ //reduce -- now all reduce implement in each threads.
+ for (int k = 0 ; k < block_size; k++)
+ {
+ if (myBestDistance > s_distance[k])
+ {
+ myBestDistance = s_distance[k];
+ myBestTrainIdx = s_trainIdx[k];
+ }
+ }
+
+ if (queryIdx < query_rows && lidx == 0)
+ {
+ bestTrainIdx[queryIdx] = myBestTrainIdx;
+ bestDistance[queryIdx] = myBestDistance;
+ }
+}
+
+__kernel void BruteForceMatch_Match(
+ __global float *query,
+ __global float *train,
+ //__global float *mask,
+ __global int *bestTrainIdx,
+ __global float *bestDistance,
+ __local float *sharebuffer,
+ int block_size,
+ int query_rows,
+ int query_cols,
+ int train_rows,
+ int train_cols,
+ int step,
+ int distType
+)
+{
+ const int lidx = get_local_id(0);
+ const int lidy = get_local_id(1);
+ const int groupidx = get_group_id(0);
+
+ const int queryIdx = groupidx * block_size + lidy;
+
+ float myBestDistance = MAX_FLOAT;
+ int myBestTrainIdx = -1;
+
+ __local float *s_query = sharebuffer;
+ __local float *s_train = sharebuffer + block_size * block_size;
+
+ // loop
+ for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
+ {
+ //Dist dist;
+ float result = 0;
+
+ for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
+ {
+ const int loadx = lidx + i * block_size;
+ //load query and train into local memory
+ s_query[lidy * block_size + lidx] = 0;
+ s_train[lidx * block_size + lidy] = 0;
+
+ if (loadx < query_cols)
+ {
+ s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
+ s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
+ sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
+
+ switch (distType)
+ {
+ case 0:
+
+ for (int j = 0 ; j < block_size ; j++)
+ {
+ result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
+ }
+
+ break;
+ case 1:
+
+ for (int j = 0 ; j < block_size ; j++)
+ {
+ float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
+ result += qr * qr;
+ }
+
+ break;
+ case 2:
+
+ for (int j = 0 ; j < block_size ; j++)
+ {
+ //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
+ result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
+ }
+
+ break;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ const int trainIdx = t * block_size + lidx;
+
+ if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/)
+ {
+ //myBestImgidx = imgIdx;
+ myBestDistance = result;
+ myBestTrainIdx = trainIdx;
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ __local float *s_distance = (__local float *)sharebuffer;
+ __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
+
+ //findBestMatch
+ s_distance += lidy * block_size;
+ s_trainIdx += lidy * block_size;
+ s_distance[lidx] = myBestDistance;
+ s_trainIdx[lidx] = myBestTrainIdx;
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ //reduce -- now all reduce implement in each threads.
+ for (int k = 0 ; k < block_size; k++)
+ {
+ if (myBestDistance > s_distance[k])
+ {
+ myBestDistance = s_distance[k];
+ myBestTrainIdx = s_trainIdx[k];
+ }
+ }
+
+ if (queryIdx < query_rows && lidx == 0)
+ {
+ bestTrainIdx[queryIdx] = myBestTrainIdx;
+ bestDistance[queryIdx] = myBestDistance;
+ }
+}
+
+//radius_unrollmatch
+__kernel void BruteForceMatch_RadiusUnrollMatch(
+ __global float *query,
+ __global float *train,
+ float maxDistance,
+ //__global float *mask,
+ __global int *bestTrainIdx,
+ __global float *bestDistance,
+ __global int *nMatches,
+ __local float *sharebuffer,
+ int block_size,
+ int max_desc_len,
+ int query_rows,
+ int query_cols,
+ int train_rows,
+ int train_cols,
+ int bestTrainIdx_cols,
+ int step,
+ int ostep,
+ int distType
+)
+{
+ const int lidx = get_local_id(0);
+ const int lidy = get_local_id(1);
+ const int groupidx = get_group_id(0);
+ const int groupidy = get_group_id(1);
+
+ const int queryIdx = groupidy * block_size + lidy;
+ const int trainIdx = groupidx * block_size + lidx;
+
+ __local float *s_query = sharebuffer;
+ __local float *s_train = sharebuffer + block_size * block_size;
+
+ float result = 0;
+
+ for (int i = 0 ; i < max_desc_len / block_size ; ++i)
+ {
+ //load a block_size * block_size block into local train.
+ const int loadx = lidx + i * block_size;
+
+ s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+ s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+
+ //synchronize to make sure each elem for reduceIteration in share memory is written already.
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ /* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
+ sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
+
+ switch (distType)
+ {
+ case 0:
+
+ for (int j = 0 ; j < block_size ; ++j)
+ {
+ result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
+ }
+
+ break;
+ case 1:
+
+ for (int j = 0 ; j < block_size ; ++j)
+ {
+ float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
+ result += qr * qr;
+ }
+
+ break;
+ case 2:
+
+ for (int j = 0 ; j < block_size ; ++j)
+ {
+ result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
+ }
+
+ break;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
+ {
+ unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
+
+ if (ind < bestTrainIdx_cols)
+ {
+ //bestImgIdx = imgIdx;
+ bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
+ bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
+ }
+ }
+}
+
+//radius_match
+__kernel void BruteForceMatch_RadiusMatch(
+ __global float *query,
+ __global float *train,
+ float maxDistance,
+ //__global float *mask,
+ __global int *bestTrainIdx,
+ __global float *bestDistance,
+ __global int *nMatches,
+ __local float *sharebuffer,
+ int block_size,
+ int query_rows,
+ int query_cols,
+ int train_rows,
+ int train_cols,
+ int bestTrainIdx_cols,
+ int step,
+ int ostep,
+ int distType
+)
+{
+ const int lidx = get_local_id(0);
+ const int lidy = get_local_id(1);
+ const int groupidx = get_group_id(0);
+ const int groupidy = get_group_id(1);
+
+ const int queryIdx = groupidy * block_size + lidy;
+ const int trainIdx = groupidx * block_size + lidx;
+
+ __local float *s_query = sharebuffer;
+ __local float *s_train = sharebuffer + block_size * block_size;
+
+ float result = 0;
+
+ for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i)
+ {
+ //load a block_size * block_size block into local train.
+ const int loadx = lidx + i * block_size;
+
+ s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+ s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+
+ //synchronize to make sure each elem for reduceIteration in share memory is written already.
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ /* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
+ sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
+
+ switch (distType)
+ {
+ case 0:
+
+ for (int j = 0 ; j < block_size ; ++j)
+ {
+ result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
+ }
+
+ break;
+ case 1:
+
+ for (int j = 0 ; j < block_size ; ++j)
+ {
+ float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
+ result += qr * qr;
+ }
+
+ break;
+ case 2:
+
+ for (int j = 0 ; j < block_size ; ++j)
+ {
+ result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]);
+ }
+
+ break;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/)
+ {
+ unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/);
+
+ if (ind < bestTrainIdx_cols)
+ {
+ //bestImgIdx = imgIdx;
+ bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx;
+ bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result;
+ }
+ }
+}
+
+
+__kernel void BruteForceMatch_knnUnrollMatch(
+ __global float *query,
+ __global float *train,
+ //__global float *mask,
+ __global int2 *bestTrainIdx,
+ __global float2 *bestDistance,
+ __local float *sharebuffer,
+ int block_size,
+ int max_desc_len,
+ int query_rows,
+ int query_cols,
+ int train_rows,
+ int train_cols,
+ int step,
+ int distType
+)
+{
+ const int lidx = get_local_id(0);
+ const int lidy = get_local_id(1);
+ const int groupidx = get_group_id(0);
+
+ const int queryIdx = groupidx * block_size + lidy;
+ local float *s_query = sharebuffer;
+ local float *s_train = sharebuffer + block_size * max_desc_len;
+
+ // load the query into local memory.
+ for (int i = 0 ; i < max_desc_len / block_size; i ++)
+ {
+ int loadx = lidx + i * block_size;
+ s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+ }
+
+ float myBestDistance1 = MAX_FLOAT;
+ float myBestDistance2 = MAX_FLOAT;
+ int myBestTrainIdx1 = -1;
+ int myBestTrainIdx2 = -1;
+
+ //loopUnrolledCached
+ volatile int imgIdx = 0;
+
+ for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
+ {
+ float result = 0;
+
+ for (int i = 0 ; i < max_desc_len / block_size ; i++)
+ {
+ const int loadX = lidx + i * block_size;
+ //load a block_size * block_size block into local train.
+ const int loadx = lidx + i * block_size;
+ s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0;
+
+ //synchronize to make sure each elem for reduceIteration in share memory is written already.
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
+ sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
+
+ switch (distType)
+ {
+ case 0:
+
+ for (int j = 0 ; j < block_size ; j++)
+ {
+ result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]);
+ }
+
+ break;
+ case 1:
+
+ for (int j = 0 ; j < block_size ; j++)
+ {
+ float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx];
+ result += qr * qr;
+ }
+
+ break;
+ case 2:
+
+ for (int j = 0 ; j < block_size ; j++)
+ {
+ //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
+ result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]);
+ }
+
+ break;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ const int trainIdx = t * block_size + lidx;
+
+ if (queryIdx < query_rows && trainIdx < train_rows)
+ {
+ if (result < myBestDistance1)
+ {
+ myBestDistance2 = myBestDistance1;
+ myBestTrainIdx2 = myBestTrainIdx1;
+ myBestDistance1 = result;
+ myBestTrainIdx1 = trainIdx;
+ }
+ else if (result < myBestDistance2)
+ {
+ myBestDistance2 = result;
+ myBestTrainIdx2 = trainIdx;
+ }
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ local float *s_distance = (local float *)sharebuffer;
+ local int *s_trainIdx = (local int *)(sharebuffer + block_size * block_size);
+
+ // find BestMatch
+ s_distance += lidy * block_size;
+ s_trainIdx += lidy * block_size;
+
+ s_distance[lidx] = myBestDistance1;
+ s_trainIdx[lidx] = myBestTrainIdx1;
+
+ float bestDistance1 = MAX_FLOAT;
+ float bestDistance2 = MAX_FLOAT;
+ int bestTrainIdx1 = -1;
+ int bestTrainIdx2 = -1;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (lidx == 0)
+ {
+ for (int i = 0 ; i < block_size ; i++)
+ {
+ float val = s_distance[i];
+
+ if (val < bestDistance1)
+ {
+ bestDistance2 = bestDistance1;
+ bestTrainIdx2 = bestTrainIdx1;
+
+ bestDistance1 = val;
+ bestTrainIdx1 = s_trainIdx[i];
+ }
+ else if (val < bestDistance2)
+ {
+ bestDistance2 = val;
+ bestTrainIdx2 = s_trainIdx[i];
+ }
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ s_distance[lidx] = myBestDistance2;
+ s_trainIdx[lidx] = myBestTrainIdx2;
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (lidx == 0)
+ {
+ for (int i = 0 ; i < block_size ; i++)
+ {
+ float val = s_distance[i];
+
+ if (val < bestDistance2)
+ {
+ bestDistance2 = val;
+ bestTrainIdx2 = s_trainIdx[i];
+ }
+ }
+ }
+
+ myBestDistance1 = bestDistance1;
+ myBestDistance2 = bestDistance2;
+
+ myBestTrainIdx1 = bestTrainIdx1;
+ myBestTrainIdx2 = bestTrainIdx2;
+
+ if (queryIdx < query_rows && lidx == 0)
+ {
+ bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
+ bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
+ }
+}
+
+__kernel void BruteForceMatch_knnMatch(
+ __global float *query,
+ __global float *train,
+ //__global float *mask,
+ __global int2 *bestTrainIdx,
+ __global float2 *bestDistance,
+ __local float *sharebuffer,
+ int block_size,
+ int query_rows,
+ int query_cols,
+ int train_rows,
+ int train_cols,
+ int step,
+ int distType
+)
+{
+ const int lidx = get_local_id(0);
+ const int lidy = get_local_id(1);
+ const int groupidx = get_group_id(0);
+
+ const int queryIdx = groupidx * block_size + lidy;
+ local float *s_query = sharebuffer;
+ local float *s_train = sharebuffer + block_size * block_size;
+
+ float myBestDistance1 = MAX_FLOAT;
+ float myBestDistance2 = MAX_FLOAT;
+ int myBestTrainIdx1 = -1;
+ int myBestTrainIdx2 = -1;
+
+ //loop
+ for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++)
+ {
+ float result = 0.0f;
+
+ for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++)
+ {
+ const int loadx = lidx + i * block_size;
+ //load query and train into local memory
+ s_query[lidy * block_size + lidx] = 0;
+ s_train[lidx * block_size + lidy] = 0;
+
+ if (loadx < query_cols)
+ {
+ s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx];
+ s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx];
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to
+ sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/
+
+ switch (distType)
+ {
+ case 0:
+
+ for (int j = 0 ; j < block_size ; j++)
+ {
+ result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]);
+ }
+
+ break;
+ case 1:
+
+ for (int j = 0 ; j < block_size ; j++)
+ {
+ float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx];
+ result += qr * qr;
+ }
+
+ break;
+ case 2:
+
+ for (int j = 0 ; j < block_size ; j++)
+ {
+ //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]);
+ result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]);
+ }
+
+ break;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ const int trainIdx = t * block_size + lidx;
+
+ if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/)
+ {
+ if (result < myBestDistance1)
+ {
+ myBestDistance2 = myBestDistance1;
+ myBestTrainIdx2 = myBestTrainIdx1;
+ myBestDistance1 = result;
+ myBestTrainIdx1 = trainIdx;
+ }
+ else if (result < myBestDistance2)
+ {
+ myBestDistance2 = result;
+ myBestTrainIdx2 = trainIdx;
+ }
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ __local float *s_distance = (__local float *)sharebuffer;
+ __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size);
+
+ //findBestMatch
+ s_distance += lidy * block_size;
+ s_trainIdx += lidy * block_size;
+
+ s_distance[lidx] = myBestDistance1;
+ s_trainIdx[lidx] = myBestTrainIdx1;
+
+ float bestDistance1 = MAX_FLOAT;
+ float bestDistance2 = MAX_FLOAT;
+ int bestTrainIdx1 = -1;
+ int bestTrainIdx2 = -1;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (lidx == 0)
+ {
+ for (int i = 0 ; i < block_size ; i++)
+ {
+ float val = s_distance[i];
+
+ if (val < bestDistance1)
+ {
+ bestDistance2 = bestDistance1;
+ bestTrainIdx2 = bestTrainIdx1;
+
+ bestDistance1 = val;
+ bestTrainIdx1 = s_trainIdx[i];
+ }
+ else if (val < bestDistance2)
+ {
+ bestDistance2 = val;
+ bestTrainIdx2 = s_trainIdx[i];
+ }
+ }
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ s_distance[lidx] = myBestDistance2;
+ s_trainIdx[lidx] = myBestTrainIdx2;
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (lidx == 0)
+ {
+ for (int i = 0 ; i < block_size ; i++)
+ {
+ float val = s_distance[i];
+
+ if (val < bestDistance2)
+ {
+ bestDistance2 = val;
+ bestTrainIdx2 = s_trainIdx[i];
+ }
+ }
+ }
+
+ myBestDistance1 = bestDistance1;
+ myBestDistance2 = bestDistance2;
+
+ myBestTrainIdx1 = bestTrainIdx1;
+ myBestTrainIdx2 = bestTrainIdx2;
+
+ if (queryIdx < query_rows && lidx == 0)
+ {
+ bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);
+ bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);
+ }
+}
+
+kernel void BruteForceMatch_calcDistanceUnrolled(
+ __global float *query,
+ __global float *train,
+ //__global float *mask,
+ __global float *allDist,
+ __local float *sharebuffer,
+ int block_size,
+ int max_desc_len,
+ int query_rows,
+ int query_cols,
+ int train_rows,
+ int train_cols,
+ int step,
+ int distType)
+{
+ /* Todo */
+}
+
+kernel void BruteForceMatch_calcDistance(
+ __global float *query,
+ __global float *train,
+ //__global float *mask,
+ __global float *allDist,
+ __local float *sharebuffer,
+ int block_size,
+ int query_rows,
+ int query_cols,
+ int train_rows,
+ int train_cols,
+ int step,
+ int distType)
+{
+ /* Todo */
+}
+
+kernel void BruteForceMatch_findBestMatch(
+ __global float *allDist,
+ __global int *bestTrainIdx,
+ __global float *bestDistance,
+ int k,
+ int block_size
+)
+{
+ /* Todo */
+}
\ No newline at end of file
map_y[y * step_y + x] = ycoo;
}
}
-
dst[start_addr] = sum;
}
}
-
-
newnode[counter].alpha[0] = t1.alpha[0];
newnode[counter].alpha[1] = t1.alpha[1];
}
-
dst[gy*(dst_step >> 2)+gx] = res;
}
}
-
-
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12;
}
#undef op(a,b)
-
#if defined DOUBLE_SUPPORT
#pragma OPENCL EXTENSION cl_khr_fp64:enable
typedef double4 F4 ;
-#else
+#else
typedef float4 F4;
#endif
{
int x = get_global_id(0);
int y = get_global_id(1);
-
+
if(x < threadCols && y < dst_rows)
{
x = x << 2;
map1_data = *((__global short8 *)((__global char*)map1 + map1Start));
int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even) + src_offset;
-
+
uchar4 con = convert_uchar4(convert_int4(map1_data.even) >= (int4)(src_cols) || convert_int4(map1_data.odd) >= (int4)(src_rows) || convert_int4(map1_data.even) < (int4)(0) || convert_int4(map1_data.odd) < (int4)(0));
uchar4 src_data = val;
src_data.s2 = *(src + srcIdx.s2);
if (con.s3 == 0)
src_data.s3 = *(src + srcIdx.s3);
-
+
uchar4 dst_data;
-
+
__global uchar4* d = (__global uchar4 *)(dst + dstStart);
- uchar4 dVal = *d;
+ uchar4 dVal = *d;
int4 dcon = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(dcon) != convert_uchar4((int4)(0))) ? src_data : dVal;
{
int x = get_global_id(0);
int y = get_global_id(1);
-
+
if(x < threadCols && y < dst_rows)
{
x = x << 2;
map1_data = *((__global float8 *)((__global char*)map1 + map1Start));
int8 map1_dataZ = convert_int8_sat_rte(map1_data);
int4 srcIdx = map1_dataZ.odd * src_step + map1_dataZ.even + src_offset;
-
+
uchar4 src_data = val;
- uchar4 con = convert_uchar4(map1_dataZ.even >= (int4)(src_cols) || map1_dataZ.odd >= (int4)(src_rows) || map1_dataZ.even < (int4)(0) || map1_dataZ.odd < (int4)(0));
+ uchar4 con = convert_uchar4(map1_dataZ.even >= (int4)(src_cols) || map1_dataZ.odd >= (int4)(src_rows) || map1_dataZ.even < (int4)(0) || map1_dataZ.odd < (int4)(0));
if (con.s0 == 0)
src_data.s0 = *(src + srcIdx.s0);
// dst_data = convert_uchar4(map1_dataZ.even >= (int4)(src_cols) || map1_dataZ.odd >= (int4)(src_rows)) ? (uchar4)(val) : src_data;
__global uchar4* d = (__global uchar4 *)(dst + dstStart);
- uchar4 dVal = *d;
+ uchar4 dVal = *d;
int4 dcon = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
-
+
dst_data = (convert_uchar4(dcon) != convert_uchar4((int4)(0))) ? src_data : dVal;
*d = dst_data;
}
{
int x = get_global_id(0);
int y = get_global_id(1);
-
+
if(x < threadCols && y < dst_rows)
{
x = x << 2;
float8 map_data = (float8)(map1_data.s0, map2_data.s0, map1_data.s1, map2_data.s1, map1_data.s2, map2_data.s2, map1_data.s3, map2_data.s3);
int8 map_dataZ = convert_int8_sat_rte(map_data);
int4 srcIdx = map_dataZ.odd * src_step + map_dataZ.even + src_offset;
-
+
uchar4 src_data = val;
- uchar4 con = convert_uchar4(map_dataZ.even >= (int4)(src_cols) || map_dataZ.odd >= (int4)(src_rows)|| map_dataZ.even < (int4)(0) || map_dataZ.odd < (int4)(0));
+ uchar4 con = convert_uchar4(map_dataZ.even >= (int4)(src_cols) || map_dataZ.odd >= (int4)(src_rows)|| map_dataZ.even < (int4)(0) || map_dataZ.odd < (int4)(0));
if (con.s0 == 0)
src_data.s0 = *(src + srcIdx.s0);
if (con.s3 == 0)
src_data.s3 = *(src + srcIdx.s3);
uchar4 dst_data;
-
+
// dst_data = convert_uchar4(map_dataZ.even >= (int4)(src_cols) || map_dataZ.odd >= (int4)(src_rows)) ? (uchar4)(val) : src_data;
__global uchar4* d = (__global uchar4 *)(dst + dstStart);
- uchar4 dVal = *d;
+ uchar4 dVal = *d;
int4 dcon = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
-
+
dst_data = (convert_uchar4(dcon) != convert_uchar4((int4)(0))) ? src_data : dVal;
*d = dst_data;
}
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
- {
+ {
int dstIdx = y * dst_step + (x << 2) + dst_offset;
int mapIdx = y * map1_step + (x << 2) + map1_offset;
float map1_data = *((__global float *)((__global char*)map1 + mapIdx));
{
int x = get_global_id(0);
int y = get_global_id(1);
-
+
if(x < threadCols && y < dst_rows)
{
int dstIdx = y * dst_step + (x << 2) + dst_offset;
src_data = *((__global float *)((__global uchar *)src + srcIdx));
*((__global float *)((__global uchar*)dst + dstIdx)) = src_data;
-
+
}
{
int x = get_global_id(0);
int y = get_global_id(1);
-
+
if(x < threadCols && y < dst_rows)
{
int dstIdx = y * dst_step + (x << 2) + dst_offset;
src_data = *((__global float *)((__global uchar *)src + srcIdx));
*((__global float *)((__global uchar*)dst + dstIdx)) = src_data;
-
+
}
}
{
int x = get_global_id(0);
int y = get_global_id(1);
-
+
if(x < threadCols && y < dst_rows)
{
int dstIdx = y * dst_step + (x << 2) + dst_offset;
src_data = *((__global float *)((__global uchar *)src + srcIdx));
*((__global float *)((__global uchar*)dst + dstIdx)) = src_data;
-
+
}
}
src_data = nval;
else
src_data = *((__global float4 *)((__global uchar *)src + srcIdx));
- *((__global float4 *)((__global uchar*)dst + dstIdx)) = src_data;
+ *((__global float4 *)((__global uchar*)dst + dstIdx)) = src_data;
+
-
}
}
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
- x = x << 2;
+ x = x << 2;
int gx = x - (dst_offset&3);
int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);
uchar4 nval =convert_uchar4(nVal);
uchar4 val = (uchar4)(nval.s0);
-
+
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3);
d.s2 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s2 * src_step + map1_dataDx1.s2 + src_offset));
if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0)
d.s3 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s3 * src_step + map1_dataDx1.s3 + src_offset));
-
+
uchar4 dst_data = convert_uchar4_sat_rte((convert_float4(a))* ud * vd +(convert_float4(b))* u * vd + (convert_float4(c))* ud * v + (convert_float4(d)) * u * v );
-
+
__global uchar4* D = (__global uchar4 *)(dst + dstStart);
- uchar4 dVal = *D;
+ uchar4 dVal = *D;
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
- x = x << 2;
+ x = x << 2;
int gx = x - (dst_offset&3);
int4 Gx = (int4)(gx, gx+1, gx+2, gx+3);
uchar4 nval =convert_uchar4(nVal);
uchar4 val = (uchar4)(nval.s0);
-
+
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3);
d.s2 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s2 * src_step + map1_dataDx1.s2 + src_offset));
if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0)
d.s3 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s3 * src_step + map1_dataDx1.s3 + src_offset));
-
+
uchar4 dst_data = convert_uchar4_sat_rte((convert_float4(a))* ud * vd +(convert_float4(b))* u * vd + (convert_float4(c))* ud * v + (convert_float4(d)) * u * v );
-
+
__global uchar4* D = (__global uchar4 *)(dst + dstStart);
- uchar4 dVal = *D;
+ uchar4 dVal = *D;
int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows);
dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal;
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
- x = x << 4;
+ x = x << 4;
int gx = x - (dst_offset&15);
int4 Gx = (int4)(gx, gx+4, gx+8, gx+12);
float4 nval =convert_float4(nVal);
float4 val = (float4)(nval.s0);
-
+
int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15);
int map1Start = y * map1_step + (x << 1) + map1_offset - ((dst_offset & 15) << 1);
float8 map1_data;
d.s2 = *((__global float*)((__global uchar *)src + map1_dataDy1.s2 * src_step + (map1_dataDx1.s2 << 2) + src_offset));
if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0)
d.s3 = *((__global float*)((__global uchar *)src + map1_dataDy1.s3 * src_step + (map1_dataDx1.s3 << 2) + src_offset));
-
+
float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v ;
-
+
__global float4* D = (__global float4 *)((__global char*)dst + dstStart);
- float4 dVal = *D;
+ float4 dVal = *D;
int4 con = (Gx >= 0 && Gx < (dst_cols << 2) && y >= 0 && y < dst_rows);
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
int y = get_global_id(1);
if(x < threadCols && y < dst_rows)
{
- x = x << 4;
+ x = x << 4;
int gx = x - (dst_offset&15);
int4 Gx = (int4)(gx, gx+4, gx+8, gx+12);
float4 nval =convert_float4(nVal);
float4 val = (float4)(nval.s0);
-
+
int dstStart = y * dst_step + x + dst_offset - (dst_offset & 15);
int map1Start = y * map1_step + x + map1_offset - (dst_offset & 15);
float4 map1_data;
d.s2 = *((__global float*)((__global uchar *)src + map1_dataDy1.s2 * src_step + (map1_dataDx1.s2 << 2) + src_offset));
if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0)
d.s3 = *((__global float*)((__global uchar *)src + map1_dataDy1.s3 * src_step + (map1_dataDx1.s3 << 2) + src_offset));
-
-
+
+
float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v ;
-
+
__global float4* D = (__global float4 *)((__global char*)dst + dstStart);
- float4 dVal = *D;
+ float4 dVal = *D;
int4 con = (Gx >= 0 && Gx < (dst_cols << 2) && y >= 0 && y < dst_rows);
dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal;
else
d = *((__global float4 *)((__global uchar *)src + map_dataD.y * src_step + (map_dataD.x<<4) + src_offset ));
- float4 dst_data = a * ((float4)(1.0-u.x)) * ((float4)(1.0-u.y)) + b *((float4)(u.x)) * ((float4)(1.0-u.y)) + c * ((float4)(1.0-u.x)) *((float4)(u.y)) + d *((float4)(u.x)) *((float4)(u.y));
+ float4 dst_data = a * ((float4)(1.0-u.x)) * ((float4)(1.0-u.y)) + b *((float4)(u.x)) * ((float4)(1.0-u.y)) + c * ((float4)(1.0-u.x)) *((float4)(u.y)) + d *((float4)(u.x)) *((float4)(u.y));
*((__global float4 *)((__global uchar*)dst + dstIdx)) = dst_data ;
}
else
d = *((__global float4 *)((__global uchar *)src + map_dataD.y * src_step + (map_dataD.x<<4) + src_offset ));
- float4 dst_data = a * ((float4)(1.0-u.x)) * ((float4)(1.0-u.y)) + b *((float4)(u.x)) * ((float4)(1.0-u.y)) + c * ((float4)(1.0-u.x)) *((float4)(u.y)) + d *((float4)(u.x)) *((float4)(u.y));
+ float4 dst_data = a * ((float4)(1.0-u.x)) * ((float4)(1.0-u.y)) + b *((float4)(u.x)) * ((float4)(1.0-u.y)) + c * ((float4)(1.0-u.x)) *((float4)(u.y)) + d *((float4)(u.x)) *((float4)(u.y));
*((__global float4 *)((__global uchar*)dst + dstIdx)) = dst_data ;
}
}
-
-
-
dst[dpos] = src[spos];
}
-
res[res_idx] = normAcc(num, denum);
}
}
-
// outsp[basesp] =(short2)((short)x0,(short)y0);
}
}
-
#define DST_ROW_A03 9
__kernel void icvContourMoments(int contour_total,
- __global float* reader_oclmat_data,
+ __global float* reader_oclmat_data,
__global T* dst_a,
int dst_step)
{
dxy = xi_1 * yi - xi * yi_1;
xii_1 = xi_1 + xi;
yii_1 = yi_1 + yi;
-
+
dst_step /= sizeof(T);
*( dst_a + DST_ROW_A00 * dst_step + idx) = dxy;
*( dst_a + DST_ROW_A10 * dst_step + idx) = dxy * xii_1;
// N = 2
// for simple haar paatern
float icvCalcHaarPatternSum_2(
- IMAGE_INT32 sumTex,
- __constant float src[2][5],
- int oldSize,
- int newSize,
- int y, int x,
+ IMAGE_INT32 sumTex,
+ __constant float src[2][5],
+ int oldSize,
+ int newSize,
+ int y, int x,
int rows, int cols, int elemPerRow)
{
// N = 3
float icvCalcHaarPatternSum_3(
- IMAGE_INT32 sumTex,
- __constant float src[2][5],
- int oldSize,
- int newSize,
- int y, int x,
+ IMAGE_INT32 sumTex,
+ __constant float src[2][5],
+ int oldSize,
+ int newSize,
+ int y, int x,
int rows, int cols, int elemPerRow)
{
// N = 4
float icvCalcHaarPatternSum_4(
- IMAGE_INT32 sumTex,
- __constant float src[2][5],
- int oldSize,
- int newSize,
- int y, int x,
+ IMAGE_INT32 sumTex,
+ __constant float src[2][5],
+ int oldSize,
+ int newSize,
+ int y, int x,
int rows, int cols, int elemPerRow)
{
const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
det [j + margin + det_step * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy;
- trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
+ trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
}
}
// Non-maximal suppression to further filtering the candidates from previous step
__kernel
void icvFindMaximaInLayer_withmask(
- __global const float * det,
- __global const float * trace,
- __global int4 * maxPosBuffer,
+ __global const float * det,
+ __global const float * trace,
+ __global int4 * maxPosBuffer,
volatile __global int* maxCounter,
int counter_offset,
int det_step, // the step of det in bytes
// Is this thread within the hessian buffer?
const int zoff = get_local_size(0) * get_local_size(1);
const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff;
- N9[localLin - zoff] =
- det[det_step *
+ N9[localLin - zoff] =
+ det[det_step *
(c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1)) // y
+ min(max(j, 0), c_img_cols - 1)]; // x
- N9[localLin ] =
- det[det_step *
+ N9[localLin ] =
+ det[det_step *
(c_layer_rows * (layer ) + min(max(i, 0), c_img_rows - 1)) // y
+ min(max(j, 0), c_img_cols - 1)]; // x
- N9[localLin + zoff] =
- det[det_step *
+ N9[localLin + zoff] =
+ det[det_step *
(c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1)) // y
+ min(max(j, 0), c_img_cols - 1)]; // x
barrier(CLK_LOCAL_MEM_FENCE);
- if (i < c_layer_rows - margin
+ if (i < c_layer_rows - margin
&& j < c_layer_cols - margin
- && get_local_id(0) > 0
+ && get_local_id(0) > 0
&& get_local_id(0) < get_local_size(0) - 1
- && get_local_id(1) > 0
+ && get_local_id(1) > 0
&& get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
)
{
__kernel
void icvFindMaximaInLayer(
- __global float * det,
- __global float * trace,
- __global int4 * maxPosBuffer,
+ __global float * det,
+ __global float * trace,
+ __global int4 * maxPosBuffer,
volatile __global int* maxCounter,
int counter_offset,
int det_step, // the step of det in bytes
int l_x = min(max(j, 0), c_img_cols - 1);
int l_y = c_layer_rows * layer + min(max(i, 0), c_img_rows - 1);
- N9[localLin - zoff] =
+ N9[localLin - zoff] =
det[det_step * (l_y - c_layer_rows) + l_x];
- N9[localLin ] =
+ N9[localLin ] =
det[det_step * (l_y ) + l_x];
- N9[localLin + zoff] =
+ N9[localLin + zoff] =
det[det_step * (l_y + c_layer_rows) + l_x];
barrier(CLK_LOCAL_MEM_FENCE);
- if (i < c_layer_rows - margin
+ if (i < c_layer_rows - margin
&& j < c_layer_cols - margin
- && get_local_id(0) > 0
+ && get_local_id(0) > 0
&& get_local_id(0) < get_local_size(0) - 1
- && get_local_id(1) > 0
+ && get_local_id(1) > 0
&& get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA
)
{
{
F invdet = 1.0 / det;
- x[0] = invdet *
+ x[0] = invdet *
(b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) +
A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] ));
- x[1] = invdet *
+ x[1] = invdet *
(A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) -
b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0]));
- x[2] = invdet *
+ x[2] = invdet *
(A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) -
A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) +
b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]));
////////////////////////////////////////////////////////////////////////
// INTERPOLATION
-__kernel
+__kernel
void icvInterpolateKeypoint(
- __global const float * det,
+ __global const float * det,
__global const int4 * maxPosBuffer,
__global float * keypoints,
volatile __global int * featureCounter,
volatile __local float N9[3][3][3];
- N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] =
+ N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] =
det[det_step * (c_layer_rows * layer + i) + j];
barrier(CLK_LOCAL_MEM_FENCE);
__constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
__constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
-__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f,
- 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f,
- 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f,
- 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f,
- 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f,
- 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f,
- 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f,
- 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f,
- 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f,
- 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f,
- 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f,
- 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f,
- 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f,
- 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f,
+__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f,
+ 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f,
+ 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f,
+ 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f,
+ 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f,
+ 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f,
+ 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f,
+ 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f,
+ 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f,
+ 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f,
+ 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f,
+ 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f,
+ 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f,
+ 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f,
0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f,
- 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f,
- 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f,
- 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f,
+ 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f,
+ 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f,
+ 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f,
0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f,
- 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f,
- 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f,
+ 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f,
+ 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f,
0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f,
0.001707611023448408f, 0.001455130288377404f};
data[tid] = *partial_reduction;
barrier(CLK_LOCAL_MEM_FENCE);
- if (tid < 16)
+ if (tid < 16)
{
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 16]);
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 8 ]);
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 4 ]);
data[tid] = *partial_reduction = op(partial_reduction, data[tid + 2 ]);
- data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]);
+ data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]);
}
#undef op
}
// utility for linear filter
inline uchar readerGet(
- IMAGE_INT8 src,
- const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
+ IMAGE_INT8 src,
+ const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
int i, int j, int rows, int cols, int elemPerRow
)
{
}
inline float linearFilter(
- IMAGE_INT8 src,
- const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
+ IMAGE_INT8 src,
+ const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir,
float y, float x, int rows, int cols, int elemPerRow
)
{
volatile __local float s_dx_bin[25],
volatile __local float s_dy_bin[25],
volatile __local float s_PATCH[6][6],
- __global const float* featureX,
- __global const float* featureY,
- __global const float* featureSize,
+ __global const float* featureX,
+ __global const float* featureY,
+ __global const float* featureSize,
__global const float* featureDir,
int rows,
int cols,
const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
const float vx = (
- s_PATCH[get_local_id(1) ][get_local_id(0) + 1] -
- s_PATCH[get_local_id(1) ][get_local_id(0) ] +
- s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
- s_PATCH[get_local_id(1) + 1][get_local_id(0) ])
+ s_PATCH[get_local_id(1) ][get_local_id(0) + 1] -
+ s_PATCH[get_local_id(1) ][get_local_id(0) ] +
+ s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
+ s_PATCH[get_local_id(1) + 1][get_local_id(0) ])
* dw;
const float vy = (
- s_PATCH[get_local_id(1) + 1][get_local_id(0) ] -
- s_PATCH[get_local_id(1) ][get_local_id(0) ] +
- s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
- s_PATCH[get_local_id(1) ][get_local_id(0) + 1])
+ s_PATCH[get_local_id(1) + 1][get_local_id(0) ] -
+ s_PATCH[get_local_id(1) ][get_local_id(0) ] +
+ s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] -
+ s_PATCH[get_local_id(1) ][get_local_id(0) + 1])
* dw;
s_dx_bin[tid] = vx;
s_dy_bin[tid] = vy;
}
}
void reduce_sum25(
- volatile __local float* sdata1,
- volatile __local float* sdata2,
- volatile __local float* sdata3,
- volatile __local float* sdata4,
+ volatile __local float* sdata1,
+ volatile __local float* sdata2,
+ volatile __local float* sdata3,
+ volatile __local float* sdata4,
int tid
)
{
}
}
-__kernel
+__kernel
void compute_descriptors64(
IMAGE_INT8 imgTex,
- volatile __global float * descriptors,
+ volatile __global float * descriptors,
__global const float * keypoints,
int descriptors_step,
- int keypoints_step,
+ int keypoints_step,
int rows,
int cols,
int img_step
if (tid < 25)
{
reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
- }
+ }
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < 25)
{
}
}
}
-__kernel
+__kernel
void compute_descriptors128(
IMAGE_INT8 imgTex,
- __global volatile float * descriptors,
+ __global volatile float * descriptors,
__global float * keypoints,
int descriptors_step,
int keypoints_step,
}
}
-__kernel
+__kernel
void normalize_descriptors128(__global float * descriptors, int descriptors_step)
{
descriptors_step /= sizeof(*descriptors);
// normalize and store in output
descriptor_base[get_local_id(0)] = lookup / len;
}
-__kernel
+__kernel
void normalize_descriptors64(__global float * descriptors, int descriptors_step)
{
descriptors_step /= sizeof(*descriptors);
////////////vector fuction name format: split_vector_C(channels number)_D(data type depth)//////
////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int src_offset,
- __global uchar *mat_dst0, int dst0_step, int dst0_offset,
- __global uchar *mat_dst1, int dst1_step, int dst1_offset,
- __global uchar *mat_dst2, int dst2_step, int dst2_offset,
+ __global uchar *mat_dst0, int dst0_step, int dst0_offset,
+ __global uchar *mat_dst1, int dst1_step, int dst1_offset,
+ __global uchar *mat_dst2, int dst2_step, int dst2_offset,
__global uchar *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
x = x << 2;
- int src_idx = mad24(y, src_step, src_offset + (x << 2));
+ int src_idx = mad24(y, src_step, src_offset + (x << 2));
- int dst0_start = mad24(y, dst0_step, dst0_offset);
+ int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x) & (int)0xfffffffc;
- int dst1_start = mad24(y, dst1_step, dst1_offset);
+ int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x) & (int)0xfffffffc;
- int dst2_start = mad24(y, dst2_step, dst2_offset);
+ int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + x) & (int)0xfffffffc;
- int dst3_start = mad24(y, dst3_step, dst3_offset);
+ int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
int dst3_idx = mad24(y, dst3_step, dst3_offset + x) & (int)0xfffffffc;
-
- uchar4 data_0 = *((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? src_idx - 12 : src_idx)));
- uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8 >= 0 ? src_idx - 8 : src_idx)));
- uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4 >= 0 ? src_idx - 4 : src_idx)));
- uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0 ));
- int total_bytes = src_offset + rows * src_step;
- uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4 < total_bytes ? src_idx + 4 : src_idx)));
- uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8 < total_bytes ? src_idx + 8 : src_idx)));
- uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx)));
+ uchar4 data_0 = *((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? src_idx - 12 : src_idx)));
+ uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8 >= 0 ? src_idx - 8 : src_idx)));
+ uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4 >= 0 ? src_idx - 4 : src_idx)));
+ uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0 ));
+
+ int total_bytes = src_offset + rows * src_step;
+ uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4 < total_bytes ? src_idx + 4 : src_idx)));
+ uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8 < total_bytes ? src_idx + 8 : src_idx)));
+ uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx)));
uchar4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3;
}
__kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int src_offset,
- __global uchar *mat_dst0, int dst0_step, int dst0_offset,
- __global uchar *mat_dst1, int dst1_step, int dst1_offset,
- __global uchar *mat_dst2, int dst2_step, int dst2_offset,
+ __global uchar *mat_dst0, int dst0_step, int dst0_offset,
+ __global uchar *mat_dst1, int dst1_step, int dst1_offset,
+ __global uchar *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
x = x << 2;
- int src_idx = mad24(y, src_step, src_offset);
+ int src_idx = mad24(y, src_step, src_offset);
- int dst0_start = mad24(y, dst0_step, dst0_offset);
+ int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
- int dst1_start = mad24(y, dst1_step, dst1_offset);
+ int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
- int dst2_start = mad24(y, dst2_step, dst2_offset);
+ int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
-
+
uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx));
uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx));
uchar4 dst2_data = *((__global uchar4 *)(mat_dst2 + dst2_idx));
uchar data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18};
int index = 3 - dst0_offset & 3;
- tmp_data0 = (uchar4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
+ tmp_data0 = (uchar4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
uchar4 data0, data1, data2;
-
+
data0 = (uchar4)(src_data_1, src_data_4, src_data_7, src_data_10);
data1 = (dst1_offset & 3) == 2 ? (uchar4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0;
data2 = (dst1_offset & 3) == 1 ? (uchar4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1;
}
__kernel void split_vector_C2_D0 (__global uchar *mat_src, int src_step, int src_offset,
- __global uchar *mat_dst0, int dst0_step, int dst0_offset,
- __global uchar *mat_dst1, int dst1_step, int dst1_offset,
+ __global uchar *mat_dst0, int dst0_step, int dst0_offset,
+ __global uchar *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
x = x << 2;
#define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1)
- int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
- int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
+ int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
+ int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
- int dst0_start = mad24(y, dst0_step, dst0_offset);
+ int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
- int dst1_start = mad24(y, dst1_step, dst1_offset);
+ int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
-
- int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
- int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
+
+ int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
+ int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
uchar8 src_data_0 = vload8(0, mat_src + src_idx_0);
uchar8 src_data_1 = vload8(0, mat_src + src_idx_1);
if(src_idx_0 == -6)
}
__kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int src_offset,
- __global char *mat_dst0, int dst0_step, int dst0_offset,
- __global char *mat_dst1, int dst1_step, int dst1_offset,
- __global char *mat_dst2, int dst2_step, int dst2_offset,
+ __global char *mat_dst0, int dst0_step, int dst0_offset,
+ __global char *mat_dst1, int dst1_step, int dst1_offset,
+ __global char *mat_dst2, int dst2_step, int dst2_offset,
__global char *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
x = x << 2;
- int src_idx = mad24(y, src_step, src_offset + (x << 2));
+ int src_idx = mad24(y, src_step, src_offset + (x << 2));
- int dst0_start = mad24(y, dst0_step, dst0_offset);
+ int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
- int dst1_start = mad24(y, dst1_step, dst1_offset);
+ int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
- int dst2_start = mad24(y, dst2_step, dst2_offset);
+ int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
- int dst3_start = mad24(y, dst3_step, dst3_offset);
+ int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
int dst3_idx = mad24(y, dst3_step, dst3_offset + x & (int)0xfffffffc);
-
- char4 data_0 = *((global char4 *)(mat_src + src_idx - 12));
- char4 data_1 = *((global char4 *)(mat_src + src_idx - 8 ));
- char4 data_2 = *((global char4 *)(mat_src + src_idx - 4 ));
- char4 data_3 = *((global char4 *)(mat_src + src_idx + 0 ));
- char4 data_4 = *((global char4 *)(mat_src + src_idx + 4 ));
- char4 data_5 = *((global char4 *)(mat_src + src_idx + 8 ));
- char4 data_6 = *((global char4 *)(mat_src + src_idx + 12));
+
+ char4 data_0 = *((global char4 *)(mat_src + src_idx - 12));
+ char4 data_1 = *((global char4 *)(mat_src + src_idx - 8 ));
+ char4 data_2 = *((global char4 *)(mat_src + src_idx - 4 ));
+ char4 data_3 = *((global char4 *)(mat_src + src_idx + 0 ));
+ char4 data_4 = *((global char4 *)(mat_src + src_idx + 4 ));
+ char4 data_5 = *((global char4 *)(mat_src + src_idx + 8 ));
+ char4 data_6 = *((global char4 *)(mat_src + src_idx + 12));
char4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3;
}
__kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int src_offset,
- __global char *mat_dst0, int dst0_step, int dst0_offset,
- __global char *mat_dst1, int dst1_step, int dst1_offset,
- __global char *mat_dst2, int dst2_step, int dst2_offset,
+ __global char *mat_dst0, int dst0_step, int dst0_offset,
+ __global char *mat_dst1, int dst1_step, int dst1_offset,
+ __global char *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
x = x << 2;
- int src_idx = mad24(y, src_step, src_offset);
+ int src_idx = mad24(y, src_step, src_offset);
- int dst0_start = mad24(y, dst0_step, dst0_offset);
+ int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
- int dst1_start = mad24(y, dst1_step, dst1_offset);
+ int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
- int dst2_start = mad24(y, dst2_step, dst2_offset);
+ int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
-
+
char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx));
char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx));
char4 dst2_data = *((__global char4 *)(mat_dst2 + dst2_idx));
char data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18};
int index = 3 - dst0_offset & 3;
- tmp_data0 = (char4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
+ tmp_data0 = (char4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
char4 data0, data1, data2;
-
+
data0 = (char4)(src_data_1, src_data_4, src_data_7, src_data_10);
data1 = (dst1_offset & 3) == 2 ? (char4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0;
data2 = (dst1_offset & 3) == 1 ? (char4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1;
}
__kernel void split_vector_C2_D1 (__global char *mat_src, int src_step, int src_offset,
- __global char *mat_dst0, int dst0_step, int dst0_offset,
- __global char *mat_dst1, int dst1_step, int dst1_offset,
+ __global char *mat_dst0, int dst0_step, int dst0_offset,
+ __global char *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
x = x << 2;
#define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1)
- int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
- int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
+ int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
+ int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
- int dst0_start = mad24(y, dst0_step, dst0_offset);
+ int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
- int dst1_start = mad24(y, dst1_step, dst1_offset);
+ int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
- int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
- int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
+ int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
+ int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
char8 src_data_0 = vload8(0, mat_src + src_idx_0);
char8 src_data_1 = vload8(0, mat_src + src_idx_1);
if(src_idx_0 == -6)
}
__kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int src_offset,
- __global ushort *mat_dst0, int dst0_step, int dst0_offset,
- __global ushort *mat_dst1, int dst1_step, int dst1_offset,
- __global ushort *mat_dst2, int dst2_step, int dst2_offset,
+ __global ushort *mat_dst0, int dst0_step, int dst0_offset,
+ __global ushort *mat_dst1, int dst1_step, int dst1_offset,
+ __global ushort *mat_dst2, int dst2_step, int dst2_offset,
__global ushort *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
x = x << 1;
- int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
- int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
+ int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
+ int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
- int dst0_start = mad24(y, dst0_step, dst0_offset);
+ int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
- int dst1_start = mad24(y, dst1_step, dst1_offset);
+ int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
- int dst2_start = mad24(y, dst2_step, dst2_offset);
+ int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
- int dst3_start = mad24(y, dst3_step, dst3_offset);
+ int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc);
-
- int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
+
+ int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
ushort8 src_data0 = vload8(0,(__global ushort *)((__global char *)mat_src + src_idx_0));
if(src_idx_0 == -6)
src_data0.s01234567 = src_data0.s67012345;
}
__kernel void split_vector_C3_D2 (__global ushort *mat_src, int src_step, int src_offset,
- __global ushort *mat_dst0, int dst0_step, int dst0_offset,
- __global ushort *mat_dst1, int dst1_step, int dst1_offset,
- __global ushort *mat_dst2, int dst2_step, int dst2_offset,
+ __global ushort *mat_dst0, int dst0_step, int dst0_offset,
+ __global ushort *mat_dst1, int dst1_step, int dst1_offset,
+ __global ushort *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
x = x << 1;
- int src_idx = mad24(y, src_step, src_offset);
+ int src_idx = mad24(y, src_step, src_offset);
- int dst0_start = mad24(y, dst0_step, dst0_offset);
+ int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
- int dst1_start = mad24(y, dst1_step, dst1_offset);
+ int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
- int dst2_start = mad24(y, dst2_step, dst2_offset);
+ int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
-
+
ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));
ushort2 dst2_data = *((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx));
}
__kernel void split_vector_C2_D2 (__global ushort *mat_src, int src_step, int src_offset,
- __global ushort *mat_dst0, int dst0_step, int dst0_offset,
- __global ushort *mat_dst1, int dst1_step, int dst1_offset,
+ __global ushort *mat_dst0, int dst0_step, int dst0_offset,
+ __global ushort *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
x = x << 1;
#define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1)
- int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
- int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
+ int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
+ int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
- int dst0_start = mad24(y, dst0_step, dst0_offset);
+ int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
- int dst1_start = mad24(y, dst1_step, dst1_offset);
+ int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
-
- int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
- int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
+
+ int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
+ int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
ushort4 src_data_0 = vload4(0, (__global ushort *)((__global char *)mat_src + src1_index_fix));
ushort4 src_data_1 = vload4(0, (__global ushort *)((__global char *)mat_src + src2_index_fix));
- if(src_idx_0 < 0)
- {
- ushort4 tmp;
- tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
- src_data_0.xyzw = (src_idx_1 == -1) ? src_data_0.wxyz:tmp.xyzw;
- }
- if(src_idx_1 < 0)
- {
- ushort4 tmp;
- tmp.xyzw = (src_idx_1 == -2) ? src_data_1.zwxy : src_data_1.yzwx;
- src_data_1.xyzw = (src_idx_1 == -1) ? src_data_1.wxyz : tmp.xyzw;
- }
-
+ if(src_idx_0 < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
+ src_data_0.xyzw = (src_idx_1 == -1) ? src_data_0.wxyz:tmp.xyzw;
+ }
+ if(src_idx_1 < 0)
+ {
+ ushort4 tmp;
+ tmp.xyzw = (src_idx_1 == -2) ? src_data_1.zwxy : src_data_1.yzwx;
+ src_data_1.xyzw = (src_idx_1 == -1) ? src_data_1.wxyz : tmp.xyzw;
+ }
+
ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));
}
}
__kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int src_offset,
- __global short *mat_dst0, int dst0_step, int dst0_offset,
- __global short *mat_dst1, int dst1_step, int dst1_offset,
- __global short *mat_dst2, int dst2_step, int dst2_offset,
+ __global short *mat_dst0, int dst0_step, int dst0_offset,
+ __global short *mat_dst1, int dst1_step, int dst1_offset,
+ __global short *mat_dst2, int dst2_step, int dst2_offset,
__global short *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
x = x << 1;
- int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
- int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
+ int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
+ int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
- int dst0_start = mad24(y, dst0_step, dst0_offset);
+ int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
- int dst1_start = mad24(y, dst1_step, dst1_offset);
+ int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
- int dst2_start = mad24(y, dst2_step, dst2_offset);
+ int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
- int dst3_start = mad24(y, dst3_step, dst3_offset);
+ int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc);
- int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
+ int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
short8 src_data0 = vload8(0,(__global short *)((__global char *)mat_src + src_idx_0));
-
+
if(src_idx_0 == -6)
src_data0.s01234567 = src_data0.s67012345;
if(src_idx_0 == -4)
src_data0.s01234567 = src_data0.s45670123;
if(src_idx_0 == -2)
src_data0.s01234567 = src_data0.s23456701;
-
+
short4 src_data1 = *((__global short4 *)((__global char *)mat_src + src_idx_1));
short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
}
}
__kernel void split_vector_C3_D3 (__global short *mat_src, int src_step, int src_offset,
- __global short *mat_dst0, int dst0_step, int dst0_offset,
- __global short *mat_dst1, int dst1_step, int dst1_offset,
- __global short *mat_dst2, int dst2_step, int dst2_offset,
+ __global short *mat_dst0, int dst0_step, int dst0_offset,
+ __global short *mat_dst1, int dst1_step, int dst1_offset,
+ __global short *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
x = x << 1;
- int src_idx = mad24(y, src_step, src_offset);
+ int src_idx = mad24(y, src_step, src_offset);
- int dst0_start = mad24(y, dst0_step, dst0_offset);
+ int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
- int dst1_start = mad24(y, dst1_step, dst1_offset);
+ int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
- int dst2_start = mad24(y, dst2_step, dst2_offset);
+ int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
-
+
short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
short2 dst2_data = *((__global short2 *)((__global char *)mat_dst2 + dst2_idx));
__kernel void split_vector_C2_D3 (__global short *mat_src, int src_step, int src_offset,
- __global short *mat_dst0, int dst0_step, int dst0_offset,
- __global short *mat_dst1, int dst1_step, int dst1_offset,
+ __global short *mat_dst0, int dst0_step, int dst0_offset,
+ __global short *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
x = x << 1;
#define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1)
- int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
- int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
+ int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
+ int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
- int dst0_start = mad24(y, dst0_step, dst0_offset);
+ int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
- int dst1_start = mad24(y, dst1_step, dst1_offset);
+ int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
- int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
- int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
+ int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
+ int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
short4 src_data_0 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_0));
short4 src_data_1 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_1));
- if(src_idx_0 < 0)
- {
- short4 tmp;
- tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
- src_data_0.xyzw = (src_idx_0 == -1) ? src_data_0.wxyz:tmp.xyzw;
- }
- if(src_idx_1< 0)
- {
- short4 tmp;
- tmp.xyzw = ( src_idx_1== -2) ? src_data_1.zwxy : src_data_1.yzwx;
- src_data_1.xyzw = ( src_idx_1== -1) ? src_data_1.wxyz : tmp.xyzw;
- }
-
+ if(src_idx_0 < 0)
+ {
+ short4 tmp;
+ tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
+ src_data_0.xyzw = (src_idx_0 == -1) ? src_data_0.wxyz:tmp.xyzw;
+ }
+ if(src_idx_1< 0)
+ {
+ short4 tmp;
+ tmp.xyzw = ( src_idx_1== -2) ? src_data_1.zwxy : src_data_1.yzwx;
+ src_data_1.xyzw = ( src_idx_1== -1) ? src_data_1.wxyz : tmp.xyzw;
+ }
+
short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
}
}
__kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src_offset,
- __global int *mat_dst0, int dst0_step, int dst0_offset,
- __global int *mat_dst1, int dst1_step, int dst1_offset,
- __global int *mat_dst2, int dst2_step, int dst2_offset,
+ __global int *mat_dst0, int dst0_step, int dst0_offset,
+ __global int *mat_dst1, int dst1_step, int dst1_offset,
+ __global int *mat_dst2, int dst2_step, int dst2_offset,
__global int *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
- int src_idx = mad24(y, src_step, src_offset);
+ int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
int dst3_idx = mad24(y, dst3_step, dst3_offset);
-
+
int4 src_data = ((__global int4 *)((__global char *)mat_src + src_idx))[x];
((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
}
}
__kernel void split_vector_C3_D4 (__global int *mat_src, int src_step, int src_offset,
- __global int *mat_dst0, int dst0_step, int dst0_offset,
- __global int *mat_dst1, int dst1_step, int dst1_offset,
- __global int *mat_dst2, int dst2_step, int dst2_offset,
+ __global int *mat_dst0, int dst0_step, int dst0_offset,
+ __global int *mat_dst1, int dst1_step, int dst1_offset,
+ __global int *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
- int src_idx = mad24(y, src_step, src_offset);
+ int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
}
__kernel void split_vector_C2_D4 (__global int *mat_src, int src_step, int src_offset,
- __global int *mat_dst0, int dst0_step, int dst0_offset,
- __global int *mat_dst1, int dst1_step, int dst1_offset,
+ __global int *mat_dst0, int dst0_step, int dst0_offset,
+ __global int *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
- int src_idx = mad24(y, src_step, src_offset);
+ int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
-
+
int2 src_data = ((__global int2 *)((__global char *)mat_src + src_idx))[x];
((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
}
__kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int src_offset,
- __global float *mat_dst0, int dst0_step, int dst0_offset,
- __global float *mat_dst1, int dst1_step, int dst1_offset,
- __global float *mat_dst2, int dst2_step, int dst2_offset,
+ __global float *mat_dst0, int dst0_step, int dst0_offset,
+ __global float *mat_dst1, int dst1_step, int dst1_offset,
+ __global float *mat_dst2, int dst2_step, int dst2_offset,
__global float *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
- int src_idx = mad24(y, src_step, src_offset);
+ int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
int dst3_idx = mad24(y, dst3_step, dst3_offset);
-
+
float4 src_data = ((__global float4 *)((__global char *)mat_src + src_idx))[x];
((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
}
__kernel void split_vector_C3_D5 (__global float *mat_src, int src_step, int src_offset,
- __global float *mat_dst0, int dst0_step, int dst0_offset,
- __global float *mat_dst1, int dst1_step, int dst1_offset,
- __global float *mat_dst2, int dst2_step, int dst2_offset,
+ __global float *mat_dst0, int dst0_step, int dst0_offset,
+ __global float *mat_dst1, int dst1_step, int dst1_offset,
+ __global float *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
- int src_idx = mad24(y, src_step, src_offset);
+ int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
}
__kernel void split_vector_C2_D5 (__global float *mat_src, int src_step, int src_offset,
- __global float *mat_dst0, int dst0_step, int dst0_offset,
- __global float *mat_dst1, int dst1_step, int dst1_offset,
+ __global float *mat_dst0, int dst0_step, int dst0_offset,
+ __global float *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
- int src_idx = mad24(y, src_step, src_offset);
+ int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
-
+
float2 src_data = ((__global float2 *)((__global char *)mat_src + src_idx))[x];
((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
#if defined (DOUBLE_SUPPORT)
__kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int src_offset,
- __global double *mat_dst0, int dst0_step, int dst0_offset,
- __global double *mat_dst1, int dst1_step, int dst1_offset,
- __global double *mat_dst2, int dst2_step, int dst2_offset,
+ __global double *mat_dst0, int dst0_step, int dst0_offset,
+ __global double *mat_dst1, int dst1_step, int dst1_offset,
+ __global double *mat_dst2, int dst2_step, int dst2_offset,
__global double *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1)
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
- int src_idx = mad24(y, src_step, src_offset);
+ int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
int dst3_idx = mad24(y, dst3_step, dst3_offset);
-
+
double4 src_data = ((__global double4 *)((__global char *)mat_src + src_idx))[x];
((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
}
__kernel void split_vector_C3_D6 (__global double *mat_src, int src_step, int src_offset,
- __global double *mat_dst0, int dst0_step, int dst0_offset,
- __global double *mat_dst1, int dst1_step, int dst1_offset,
- __global double *mat_dst2, int dst2_step, int dst2_offset,
+ __global double *mat_dst0, int dst0_step, int dst0_offset,
+ __global double *mat_dst1, int dst1_step, int dst1_offset,
+ __global double *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
- int src_idx = mad24(y, src_step, src_offset);
+ int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset);
}
__kernel void split_vector_C2_D6 (__global double *mat_src, int src_step, int src_offset,
- __global double *mat_dst0, int dst0_step, int dst0_offset,
- __global double *mat_dst1, int dst1_step, int dst1_offset,
+ __global double *mat_dst0, int dst0_step, int dst0_offset,
+ __global double *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if((x < cols) && (y < rows))
{
- int src_idx = mad24(y, src_step, src_offset);
+ int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset);
-
+
double2 src_data = ((__global double2 *)((__global char *)mat_src + src_idx))[x];
((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
return a * a;
}
-unsigned int CalcSSD(volatile __local unsigned int *col_ssd_cache,
+unsigned int CalcSSD(volatile __local unsigned int *col_ssd_cache,
volatile __local unsigned int *col_ssd, int radius)
-{
+{
unsigned int cache = 0;
unsigned int cache2 = 0;
return col_ssd[0] + cache + cache2;
}
-uint2 MinSSD(volatile __local unsigned int *col_ssd_cache,
+uint2 MinSSD(volatile __local unsigned int *col_ssd_cache,
volatile __local unsigned int *col_ssd, int radius)
{
unsigned int ssd[N_DISPARITIES];
return (uint2)(mssd, bestIdx);
}
-void StepDown(int idx1, int idx2, __global unsigned char* imageL,
+void StepDown(int idx1, int idx2, __global unsigned char* imageL,
__global unsigned char* imageR, int d, volatile __local unsigned int *col_ssd, int radius)
{
unsigned char leftPixel1;
col_ssd[7 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1);
}
-void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imageL,
- __global unsigned char* imageR, int d,
+void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imageL,
+ __global unsigned char* imageR, int d,
volatile __local unsigned int *col_ssd, int radius)
{
unsigned char leftPixel1;
col_ssd[7 * (BLOCK_W + 2 * radius)] = diffa[7];
}
-__kernel void stereoKernel(__global unsigned char *left, __global unsigned char *right,
+__kernel void stereoKernel(__global unsigned char *left, __global unsigned char *right,
__global unsigned int *cminSSDImage, int cminSSD_step,
__global unsigned char *disp, int disp_step,int cwidth, int cheight,
- int img_step, int maxdisp, int radius,
+ int img_step, int maxdisp, int radius,
__local unsigned int *col_ssd_cache)
{
volatile __local unsigned int *col_ssd = col_ssd_cache + BLOCK_W + get_local_id(0);
- volatile __local unsigned int *col_ssd_extra = get_local_id(0) < (2 * radius) ? col_ssd + BLOCK_W : 0;
+ volatile __local unsigned int *col_ssd_extra = get_local_id(0) < (2 * radius) ? col_ssd + BLOCK_W : 0;
int X = get_group_id(0) * BLOCK_W + get_local_id(0) + maxdisp + radius;
// int Y = get_group_id(1) * ROWSperTHREAD + radius;
int idx1 = y_tex * img_step + x_tex;
int idx2 = (y_tex + (2 * radius + 1)) * img_step + x_tex;
- barrier(CLK_GLOBAL_MEM_FENCE);
- barrier(CLK_LOCAL_MEM_FENCE);
+ barrier(CLK_GLOBAL_MEM_FENCE);
+ barrier(CLK_LOCAL_MEM_FENCE);
StepDown(idx1, idx2, left, right, d, col_ssd, radius);
if (col_ssd_extra > 0)
y_tex += 1;
- barrier(CLK_LOCAL_MEM_FENCE);
+ barrier(CLK_LOCAL_MEM_FENCE);
if (X < cwidth - radius && row < cheight - radius - Y)
{
//////////////////////////// Sobel Prefiler (signal channel)//////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
-__kernel void prefilter_xsobel(__global unsigned char *input, __global unsigned char *output,
+__kernel void prefilter_xsobel(__global unsigned char *input, __global unsigned char *output,
int rows, int cols, int prefilterCap)
{
int x = get_global_id(0);
if(x < cols && y < rows)
{
- int cov = input[(y-1) * cols + (x-1)] * (-1) + input[(y-1) * cols + (x+1)] * (1) +
+ int cov = input[(y-1) * cols + (x-1)] * (-1) + input[(y-1) * cols + (x+1)] * (1) +
input[(y) * cols + (x-1)] * (-2) + input[(y) * cols + (x+1)] * (2) +
input[(y+1) * cols + (x-1)] * (-1) + input[(y+1) * cols + (x+1)] * (1);
int x1 = x==0? 0 : x-1;
if(x < cols && y < rows)
{
- conv = (float)input[(y1) * cols + (x1)] * (-1) + (float)input[(y1) * cols + (x+1)] * (1) +
+ conv = (float)input[(y1) * cols + (x1)] * (-1) + (float)input[(y1) * cols + (x+1)] * (1) +
(float)input[(y) * cols + (x1)] * (-2) + (float)input[(y) * cols + (x+1)] * (2) +
(float)input[(y+1) * cols + (x1)] * (-1) + (float)input[(y+1) * cols + (x+1)] * (1);
-
+
}
return fabs(conv);
}
}
#define RpT (2 * ROWSperTHREAD) // got experimentally
-__kernel void textureness_kernel(__global unsigned char *disp, int disp_rows, int disp_cols,
- int disp_step, __global unsigned char *input, int input_rows,
- int input_cols,int winsz, float threshold,
+__kernel void textureness_kernel(__global unsigned char *disp, int disp_rows, int disp_cols,
+ int disp_step, __global unsigned char *input, int input_rows,
+ int input_cols,int winsz, float threshold,
__local float *cols_cache)
{
int winsz2 = winsz/2;
for(int y = beg_row + 1; y < end_row; ++y)
{
- sum = sum - sobel(input, x - winsz2, y - winsz2 - 1, input_rows, input_cols) +
+ sum = sum - sobel(input, x - winsz2, y - winsz2 - 1, input_rows, input_cols) +
sobel(input, x - winsz2, y + winsz2, input_rows, input_cols);
*cols = sum;
if (cols_extra)
{
- sum_extra = sum_extra - sobel(input, x + group_size_x - winsz2, y - winsz2 - 1,input_rows, input_cols)
+ sum_extra = sum_extra - sobel(input, x + group_size_x - winsz2, y - winsz2 - 1,input_rows, input_cols)
+ sobel(input, x + group_size_x - winsz2, y + winsz2, input_rows, input_cols);
*cols_extra = sum_extra;
}