cmake_minimum_required(VERSION 2.6.3)
endif()
+if(POLICY CMP0017)
+ cmake_policy(SET CMP0017 NEW)
+endif()
+
if(POLICY CMP0022)
cmake_policy(SET CMP0022 OLD)
endif()
set(CMAKE_MODULE_PATH "${OpenCV_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
-find_host_package(CUDA 4.2 QUIET)
+foreach(var INCLUDE LIBRARY PROGRAM)
+ set(__old_frpm_${var} "${CMAKE_FIND_ROOT_PATH_MODE_${var}}")
+endforeach()
+
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER)
+
+find_package(CUDA 4.2 QUIET)
+
+foreach(var INCLUDE LIBRARY PROGRAM)
+ set(CMAKE_FIND_ROOT_PATH_MODE_${var} "${__old_frpm_${var}}")
+endforeach()
+
+list(REMOVE_AT CMAKE_MODULE_PATH 0)
if(CUDA_FOUND)
set(HAVE_CUDA 1)
source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs})
file(GLOB cl_kernels "src/opencl/*.cl")
- if(HAVE_OPENCL AND cl_kernels)
+ if(HAVE_opencv_ocl AND cl_kernels)
ocv_include_directories(${OPENCL_INCLUDE_DIRS})
add_custom_command(
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.cpp" "${CMAKE_CURRENT_BINARY_DIR}/opencl_kernels.hpp"
if(NOT ANDROID)
ocv_list_filterout(handwrittren_java_sources "/(engine|android)\\\\+")
ocv_list_filterout(handwrittren_aidl_sources "/(engine|android)\\\\+")
+ ocv_list_filterout(handwrittren_java_sources "VideoCapture")
+ ocv_list_filterout(handwrittren_cpp_sources "VideoCapture")
else()
file(GLOB_RECURSE handwrittren_lib_project_files_rel RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/android_lib/" "${CMAKE_CURRENT_SOURCE_DIR}/android_lib/*")
list(REMOVE_ITEM handwrittren_lib_project_files_rel "${ANDROID_MANIFEST_FILE}")
# first run of gen_java.py (to get list of generated files)
file(REMOVE_RECURSE "${CMAKE_CURRENT_BINARY_DIR}/gen_java_out/")
file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/gen_java_out")
- execute_process(COMMAND ${PYTHON_EXECUTABLE} "${scripts_gen_java}" "${scripts_hdr_parser}" ${module} ${opencv_public_headers_${module}}
+ if (ANDROID)
+ execute_process(COMMAND ${PYTHON_EXECUTABLE} "${scripts_gen_java}" "${scripts_hdr_parser}" "-android" ${module} ${opencv_public_headers_${module}}
WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/gen_java_out"
OUTPUT_QUIET ERROR_QUIET)
+ else()
+ execute_process(COMMAND ${PYTHON_EXECUTABLE} "${scripts_gen_java}" "${scripts_hdr_parser}" ${module} ${opencv_public_headers_${module}}
+ WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/gen_java_out"
+ OUTPUT_QUIET ERROR_QUIET)
+ endif()
unset(generated_java_sources_${module})
file(GLOB_RECURSE generated_java_sources_${module} RELATIVE "${CMAKE_CURRENT_BINARY_DIR}/gen_java_out/" "${CMAKE_CURRENT_BINARY_DIR}/gen_java_out/*.java")
ocv_list_add_prefix(generated_java_sources_${module} "${CMAKE_CURRENT_BINARY_DIR}/")
set(step1_depends "${scripts_gen_java}" "${scripts_hdr_parser}" ${opencv_public_headers})
foreach(module ${OPENCV_JAVA_MODULES})
# second run of gen_java.py (at build time)
- add_custom_command(OUTPUT ${generated_java_sources_${module}} "${CMAKE_CURRENT_BINARY_DIR}/${module}.cpp"
+ if (ANDROID)
+ add_custom_command(OUTPUT ${generated_java_sources_${module}} "${CMAKE_CURRENT_BINARY_DIR}/${module}.cpp"
+ COMMAND ${PYTHON_EXECUTABLE} "${scripts_gen_java}" "${scripts_hdr_parser}" "-android" ${module} ${opencv_public_headers_${module}}
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+ DEPENDS "${scripts_gen_java}" "${scripts_hdr_parser}" ${opencv_public_headers_${module}}
+ )
+ else()
+ add_custom_command(OUTPUT ${generated_java_sources_${module}} "${CMAKE_CURRENT_BINARY_DIR}/${module}.cpp"
COMMAND ${PYTHON_EXECUTABLE} "${scripts_gen_java}" "${scripts_hdr_parser}" ${module} ${opencv_public_headers_${module}}
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
DEPENDS "${scripts_gen_java}" "${scripts_hdr_parser}" ${opencv_public_headers_${module}}
)
+ endif()
endforeach()
# step 2: generate javadoc comments
hdr_parser_path = os.path.dirname(hdr_parser_path)
sys.path.append(hdr_parser_path)
import hdr_parser
- module = sys.argv[2]
- srcfiles = sys.argv[3:]
+ if (sys.argv[2] == "-android"):
+ class_ignore_list += ("VideoCapture",)
+ ManualFuncs.pop("VideoCapture")
+ module = sys.argv[3]
+ srcfiles = sys.argv[4:]
+ else:
+ module = sys.argv[2]
+ srcfiles = sys.argv[3:]
+
#print "Generating module '" + module + "' from headers:\n\t" + "\n\t".join(srcfiles)
generator = JavaWrapperGenerator()
generator.gen(srcfiles, module, dstdir)
--- /dev/null
+#define LOG_TAG "org.opencv.highgui.VideoCapture"
+#include "common.h"
+
+#include "opencv2/opencv_modules.hpp"
+#ifdef HAVE_OPENCV_HIGHGUI
+
+#include "opencv2/highgui/highgui_c.h"
+#include "opencv2/highgui/highgui.hpp"
+using namespace cv;
+
+
+extern "C" {
+
+//
+// VideoCapture::VideoCapture()
+//
+
+JNIEXPORT jlong JNICALL Java_org_opencv_highgui_VideoCapture_n_1VideoCapture__
+ (JNIEnv* env, jclass);
+
+JNIEXPORT jlong JNICALL Java_org_opencv_highgui_VideoCapture_n_1VideoCapture__
+ (JNIEnv* env, jclass)
+{
+ try {
+ LOGD("highgui::VideoCapture_n_1VideoCapture__()");
+
+ VideoCapture* _retval_ = new VideoCapture( );
+
+ return (jlong) _retval_;
+ } catch(cv::Exception e) {
+ LOGD("highgui::VideoCapture_n_1VideoCapture__() catched cv::Exception: %s", e.what());
+ jclass je = env->FindClass("org/opencv/core/CvException");
+ if(!je) je = env->FindClass("java/lang/Exception");
+ env->ThrowNew(je, e.what());
+ return 0;
+ } catch (...) {
+ LOGD("highgui::VideoCapture_n_1VideoCapture__() catched unknown exception (...)");
+ jclass je = env->FindClass("java/lang/Exception");
+ env->ThrowNew(je, "Unknown exception in JNI code {highgui::VideoCapture_n_1VideoCapture__()}");
+ return 0;
+ }
+}
+
+
+//
+// VideoCapture::VideoCapture(int device)
+//
+
+JNIEXPORT jlong JNICALL Java_org_opencv_highgui_VideoCapture_n_1VideoCapture__I
+ (JNIEnv* env, jclass, jint device);
+
+JNIEXPORT jlong JNICALL Java_org_opencv_highgui_VideoCapture_n_1VideoCapture__I
+ (JNIEnv* env, jclass, jint device)
+{
+ try {
+ LOGD("highgui::VideoCapture_n_1VideoCapture__I()");
+
+ VideoCapture* _retval_ = new VideoCapture( device );
+
+ return (jlong) _retval_;
+ } catch(cv::Exception e) {
+ LOGD("highgui::VideoCapture_n_1VideoCapture__I() catched cv::Exception: %s", e.what());
+ jclass je = env->FindClass("org/opencv/core/CvException");
+ if(!je) je = env->FindClass("java/lang/Exception");
+ env->ThrowNew(je, e.what());
+ return 0;
+ } catch (...) {
+ LOGD("highgui::VideoCapture_n_1VideoCapture__I() catched unknown exception (...)");
+ jclass je = env->FindClass("java/lang/Exception");
+ env->ThrowNew(je, "Unknown exception in JNI code {highgui::VideoCapture_n_1VideoCapture__I()}");
+ return 0;
+ }
+}
+
+
+
+//
+// double VideoCapture::get(int propId)
+//
+
+JNIEXPORT jdouble JNICALL Java_org_opencv_highgui_VideoCapture_n_1get
+ (JNIEnv* env, jclass, jlong self, jint propId);
+
+JNIEXPORT jdouble JNICALL Java_org_opencv_highgui_VideoCapture_n_1get
+ (JNIEnv* env, jclass, jlong self, jint propId)
+{
+ try {
+ LOGD("highgui::VideoCapture_n_1get()");
+ VideoCapture* me = (VideoCapture*) self; //TODO: check for NULL
+ double _retval_ = me->get( propId );
+
+ return _retval_;
+ } catch(cv::Exception e) {
+ LOGD("highgui::VideoCapture_n_1get() catched cv::Exception: %s", e.what());
+ jclass je = env->FindClass("org/opencv/core/CvException");
+ if(!je) je = env->FindClass("java/lang/Exception");
+ env->ThrowNew(je, e.what());
+ return 0;
+ } catch (...) {
+ LOGD("highgui::VideoCapture_n_1get() catched unknown exception (...)");
+ jclass je = env->FindClass("java/lang/Exception");
+ env->ThrowNew(je, "Unknown exception in JNI code {highgui::VideoCapture_n_1get()}");
+ return 0;
+ }
+}
+
+
+
+//
+// bool VideoCapture::grab()
+//
+
+JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1grab
+ (JNIEnv* env, jclass, jlong self);
+
+JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1grab
+ (JNIEnv* env, jclass, jlong self)
+{
+ try {
+ LOGD("highgui::VideoCapture_n_1grab()");
+ VideoCapture* me = (VideoCapture*) self; //TODO: check for NULL
+ bool _retval_ = me->grab( );
+
+ return _retval_;
+ } catch(cv::Exception e) {
+ LOGD("highgui::VideoCapture_n_1grab() catched cv::Exception: %s", e.what());
+ jclass je = env->FindClass("org/opencv/core/CvException");
+ if(!je) je = env->FindClass("java/lang/Exception");
+ env->ThrowNew(je, e.what());
+ return 0;
+ } catch (...) {
+ LOGD("highgui::VideoCapture_n_1grab() catched unknown exception (...)");
+ jclass je = env->FindClass("java/lang/Exception");
+ env->ThrowNew(je, "Unknown exception in JNI code {highgui::VideoCapture_n_1grab()}");
+ return 0;
+ }
+}
+
+
+
+//
+// bool VideoCapture::isOpened()
+//
+
+JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1isOpened
+ (JNIEnv* env, jclass, jlong self);
+
+JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1isOpened
+ (JNIEnv* env, jclass, jlong self)
+{
+ try {
+ LOGD("highgui::VideoCapture_n_1isOpened()");
+ VideoCapture* me = (VideoCapture*) self; //TODO: check for NULL
+ bool _retval_ = me->isOpened( );
+
+ return _retval_;
+ } catch(cv::Exception e) {
+ LOGD("highgui::VideoCapture_n_1isOpened() catched cv::Exception: %s", e.what());
+ jclass je = env->FindClass("org/opencv/core/CvException");
+ if(!je) je = env->FindClass("java/lang/Exception");
+ env->ThrowNew(je, e.what());
+ return 0;
+ } catch (...) {
+ LOGD("highgui::VideoCapture_n_1isOpened() catched unknown exception (...)");
+ jclass je = env->FindClass("java/lang/Exception");
+ env->ThrowNew(je, "Unknown exception in JNI code {highgui::VideoCapture_n_1isOpened()}");
+ return 0;
+ }
+}
+
+
+//
+// bool VideoCapture::open(int device)
+//
+
+JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1open__JI
+ (JNIEnv* env, jclass, jlong self, jint device);
+
+JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1open__JI
+ (JNIEnv* env, jclass, jlong self, jint device)
+{
+ try {
+ LOGD("highgui::VideoCapture_n_1open__JI()");
+ VideoCapture* me = (VideoCapture*) self; //TODO: check for NULL
+ bool _retval_ = me->open( device );
+
+ return _retval_;
+ } catch(cv::Exception e) {
+ LOGD("highgui::VideoCapture_n_1open__JI() catched cv::Exception: %s", e.what());
+ jclass je = env->FindClass("org/opencv/core/CvException");
+ if(!je) je = env->FindClass("java/lang/Exception");
+ env->ThrowNew(je, e.what());
+ return 0;
+ } catch (...) {
+ LOGD("highgui::VideoCapture_n_1open__JI() catched unknown exception (...)");
+ jclass je = env->FindClass("java/lang/Exception");
+ env->ThrowNew(je, "Unknown exception in JNI code {highgui::VideoCapture_n_1open__JI()}");
+ return 0;
+ }
+}
+
+
+
+//
+// bool VideoCapture::read(Mat image)
+//
+
+JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1read
+ (JNIEnv* env, jclass, jlong self, jlong image_nativeObj);
+
+JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1read
+ (JNIEnv* env, jclass, jlong self, jlong image_nativeObj)
+{
+ try {
+ LOGD("highgui::VideoCapture_n_1read()");
+ VideoCapture* me = (VideoCapture*) self; //TODO: check for NULL
+ Mat& image = *((Mat*)image_nativeObj);
+ bool _retval_ = me->read( image );
+
+ return _retval_;
+ } catch(cv::Exception e) {
+ LOGD("highgui::VideoCapture_n_1read() catched cv::Exception: %s", e.what());
+ jclass je = env->FindClass("org/opencv/core/CvException");
+ if(!je) je = env->FindClass("java/lang/Exception");
+ env->ThrowNew(je, e.what());
+ return 0;
+ } catch (...) {
+ LOGD("highgui::VideoCapture_n_1read() catched unknown exception (...)");
+ jclass je = env->FindClass("java/lang/Exception");
+ env->ThrowNew(je, "Unknown exception in JNI code {highgui::VideoCapture_n_1read()}");
+ return 0;
+ }
+}
+
+
+
+//
+// void VideoCapture::release()
+//
+
+JNIEXPORT void JNICALL Java_org_opencv_highgui_VideoCapture_n_1release
+ (JNIEnv* env, jclass, jlong self);
+
+JNIEXPORT void JNICALL Java_org_opencv_highgui_VideoCapture_n_1release
+ (JNIEnv* env, jclass, jlong self)
+{
+ try {
+
+ LOGD("highgui::VideoCapture_n_1release()");
+
+ VideoCapture* me = (VideoCapture*) self; //TODO: check for NULL
+ me->release( );
+
+ return;
+ } catch(cv::Exception e) {
+
+ LOGD("highgui::VideoCapture_n_1release() catched cv::Exception: %s", e.what());
+
+ jclass je = env->FindClass("org/opencv/core/CvException");
+ if(!je) je = env->FindClass("java/lang/Exception");
+ env->ThrowNew(je, e.what());
+ return;
+ } catch (...) {
+
+ LOGD("highgui::VideoCapture_n_1release() catched unknown exception (...)");
+
+ jclass je = env->FindClass("java/lang/Exception");
+ env->ThrowNew(je, "Unknown exception in JNI code {highgui::VideoCapture_n_1release()}");
+ return;
+ }
+}
+
+
+
+//
+// bool VideoCapture::retrieve(Mat image, int channel = 0)
+//
+
+JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1retrieve__JJI
+ (JNIEnv* env, jclass, jlong self, jlong image_nativeObj, jint channel);
+
+JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1retrieve__JJI
+ (JNIEnv* env, jclass, jlong self, jlong image_nativeObj, jint channel)
+{
+ try {
+
+ LOGD("highgui::VideoCapture_n_1retrieve__JJI()");
+
+ VideoCapture* me = (VideoCapture*) self; //TODO: check for NULL
+ Mat& image = *((Mat*)image_nativeObj);
+ bool _retval_ = me->retrieve( image, channel );
+
+ return _retval_;
+ } catch(cv::Exception e) {
+
+ LOGD("highgui::VideoCapture_n_1retrieve__JJI() catched cv::Exception: %s", e.what());
+
+ jclass je = env->FindClass("org/opencv/core/CvException");
+ if(!je) je = env->FindClass("java/lang/Exception");
+ env->ThrowNew(je, e.what());
+ return 0;
+ } catch (...) {
+
+ LOGD("highgui::VideoCapture_n_1retrieve__JJI() catched unknown exception (...)");
+
+ jclass je = env->FindClass("java/lang/Exception");
+ env->ThrowNew(je, "Unknown exception in JNI code {highgui::VideoCapture_n_1retrieve__JJI()}");
+ return 0;
+ }
+}
+
+
+
+JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1retrieve__JJ
+ (JNIEnv* env, jclass, jlong self, jlong image_nativeObj);
+
+JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1retrieve__JJ
+ (JNIEnv* env, jclass, jlong self, jlong image_nativeObj)
+{
+ try {
+
+ LOGD("highgui::VideoCapture_n_1retrieve__JJ()");
+
+ VideoCapture* me = (VideoCapture*) self; //TODO: check for NULL
+ Mat& image = *((Mat*)image_nativeObj);
+ bool _retval_ = me->retrieve( image );
+
+ return _retval_;
+ } catch(cv::Exception e) {
+
+ LOGD("highgui::VideoCapture_n_1retrieve__JJ() catched cv::Exception: %s", e.what());
+
+ jclass je = env->FindClass("org/opencv/core/CvException");
+ if(!je) je = env->FindClass("java/lang/Exception");
+ env->ThrowNew(je, e.what());
+ return 0;
+ } catch (...) {
+
+ LOGD("highgui::VideoCapture_n_1retrieve__JJ() catched unknown exception (...)");
+
+ jclass je = env->FindClass("java/lang/Exception");
+ env->ThrowNew(je, "Unknown exception in JNI code {highgui::VideoCapture_n_1retrieve__JJ()}");
+ return 0;
+ }
+}
+
+
+
+//
+// bool VideoCapture::set(int propId, double value)
+//
+
+JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1set
+ (JNIEnv* env, jclass, jlong self, jint propId, jdouble value);
+
+JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1set
+ (JNIEnv* env, jclass, jlong self, jint propId, jdouble value)
+{
+ try {
+
+ LOGD("highgui::VideoCapture_n_1set()");
+
+ VideoCapture* me = (VideoCapture*) self; //TODO: check for NULL
+ bool _retval_ = me->set( propId, value );
+
+ return _retval_;
+ } catch(cv::Exception e) {
+
+ LOGD("highgui::VideoCapture_n_1set() catched cv::Exception: %s", e.what());
+
+ jclass je = env->FindClass("org/opencv/core/CvException");
+ if(!je) je = env->FindClass("java/lang/Exception");
+ env->ThrowNew(je, e.what());
+ return 0;
+ } catch (...) {
+
+ LOGD("highgui::VideoCapture_n_1set() catched unknown exception (...)");
+
+ jclass je = env->FindClass("java/lang/Exception");
+ env->ThrowNew(je, "Unknown exception in JNI code {highgui::VideoCapture_n_1set()}");
+ return 0;
+ }
+}
+
+JNIEXPORT jstring JNICALL Java_org_opencv_highgui_VideoCapture_n_1getSupportedPreviewSizes
+ (JNIEnv *env, jclass, jlong self);
+
+JNIEXPORT jstring JNICALL Java_org_opencv_highgui_VideoCapture_n_1getSupportedPreviewSizes
+ (JNIEnv *env, jclass, jlong self)
+{
+ try {
+
+ LOGD("highgui::VideoCapture_n_1set()");
+
+ VideoCapture* me = (VideoCapture*) self; //TODO: check for NULL
+ union {double prop; const char* name;} u;
+ u.prop = me->get(CV_CAP_PROP_SUPPORTED_PREVIEW_SIZES_STRING);
+ return env->NewStringUTF(u.name);
+ } catch(cv::Exception e) {
+
+ LOGD("highgui::VideoCapture_n_1getSupportedPreviewSizes() catched cv::Exception: %s", e.what());
+
+ jclass je = env->FindClass("org/opencv/core/CvException");
+ if(!je) je = env->FindClass("java/lang/Exception");
+ env->ThrowNew(je, e.what());
+ return env->NewStringUTF("");
+ } catch (...) {
+
+ LOGD("highgui::VideoCapture_n_1getSupportedPreviewSizes() catched unknown exception (...)");
+
+ jclass je = env->FindClass("java/lang/Exception");
+ env->ThrowNew(je, "Unknown exception in JNI code {highgui::VideoCapture_n_1getSupportedPreviewSizes()}");
+ return env->NewStringUTF("");
+ }
+}
+
+
+
+//
+// native support for java finalize()
+// static void VideoCapture::n_delete( __int64 self )
+//
+
+JNIEXPORT void JNICALL Java_org_opencv_highgui_VideoCapture_n_1delete
+ (JNIEnv*, jclass, jlong self);
+
+JNIEXPORT void JNICALL Java_org_opencv_highgui_VideoCapture_n_1delete
+ (JNIEnv*, jclass, jlong self)
+{
+ delete (VideoCapture*) self;
+}
+
+} // extern "C"
+
+#endif // HAVE_OPENCV_HIGHGUI
\ No newline at end of file
+++ /dev/null
-// emulating the 'old' JNI names existed before the VideoCapture wrapping became automatic
-
-#define LOG_TAG "org.opencv.highgui.VideoCapture"
-#include "common.h"
-
-#include "opencv2/opencv_modules.hpp"
-#ifdef HAVE_OPENCV_HIGHGUI
-
-#include "opencv2/core/version.hpp"
-
-#if (CV_VERSION_EPOCH == 2) && (CV_VERSION_MAJOR == 4)
-extern "C" {
-
-JNIEXPORT jlong JNICALL Java_org_opencv_highgui_VideoCapture_n_1VideoCapture__
- (JNIEnv* env, jclass c);
-
-JNIEXPORT jlong JNICALL Java_org_opencv_highgui_VideoCapture_VideoCapture_10 (JNIEnv*, jclass);
-
-JNIEXPORT jlong JNICALL Java_org_opencv_highgui_VideoCapture_n_1VideoCapture__
- (JNIEnv* env, jclass c)
-{
- return Java_org_opencv_highgui_VideoCapture_VideoCapture_10(env, c);
-}
-
-
-JNIEXPORT jlong JNICALL Java_org_opencv_highgui_VideoCapture_n_1VideoCapture__I
- (JNIEnv* env, jclass c, jint device);
-
-JNIEXPORT jlong JNICALL Java_org_opencv_highgui_VideoCapture_VideoCapture_12 (JNIEnv*, jclass, jint);
-
-JNIEXPORT jlong JNICALL Java_org_opencv_highgui_VideoCapture_n_1VideoCapture__I
- (JNIEnv* env, jclass c, jint device)
-{
- return Java_org_opencv_highgui_VideoCapture_VideoCapture_12(env, c, device);
-}
-
-
-JNIEXPORT jdouble JNICALL Java_org_opencv_highgui_VideoCapture_n_1get
- (JNIEnv* env, jclass c, jlong self, jint propId);
-
-JNIEXPORT jdouble JNICALL Java_org_opencv_highgui_VideoCapture_get_10 (JNIEnv*, jclass, jlong, jint);
-
-JNIEXPORT jdouble JNICALL Java_org_opencv_highgui_VideoCapture_n_1get
- (JNIEnv* env, jclass c, jlong self, jint propId)
-{
- return Java_org_opencv_highgui_VideoCapture_get_10(env, c, self, propId);
-}
-
-
-JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1grab
- (JNIEnv* env, jclass c, jlong self);
-
-JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_grab_10 (JNIEnv*, jclass, jlong);
-
-JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1grab
- (JNIEnv* env, jclass c, jlong self)
-{
- return Java_org_opencv_highgui_VideoCapture_grab_10(env, c, self);
-}
-
-
-JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1isOpened
- (JNIEnv* env, jclass c, jlong self);
-
-JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_isOpened_10 (JNIEnv*, jclass, jlong);
-
-JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1isOpened
- (JNIEnv* env, jclass c, jlong self)
-{
- return Java_org_opencv_highgui_VideoCapture_isOpened_10(env, c, self);
-}
-
-
-JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1open__JI
- (JNIEnv* env, jclass c, jlong self, jint device);
-
-JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_open_11 (JNIEnv*, jclass, jlong, jint);
-
-JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1open__JI
- (JNIEnv* env, jclass c, jlong self, jint device)
-{
- return Java_org_opencv_highgui_VideoCapture_open_11(env, c, self, device);
-}
-
-
-JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1read
- (JNIEnv* env, jclass c, jlong self, jlong image_nativeObj);
-
-JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_read_10 (JNIEnv*, jclass, jlong, jlong);
-
-JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1read
- (JNIEnv* env, jclass c, jlong self, jlong image_nativeObj)
-{
- return Java_org_opencv_highgui_VideoCapture_read_10(env, c, self, image_nativeObj);
-}
-
-
-JNIEXPORT void JNICALL Java_org_opencv_highgui_VideoCapture_n_1release
- (JNIEnv* env, jclass c, jlong self);
-
-JNIEXPORT void JNICALL Java_org_opencv_highgui_VideoCapture_release_10 (JNIEnv*, jclass, jlong);
-
-JNIEXPORT void JNICALL Java_org_opencv_highgui_VideoCapture_n_1release
- (JNIEnv* env, jclass c, jlong self)
-{
- Java_org_opencv_highgui_VideoCapture_release_10(env, c, self);
-}
-
-
-JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1retrieve__JJI
- (JNIEnv* env, jclass c, jlong self, jlong image_nativeObj, jint channel);
-
-JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_retrieve_10 (JNIEnv*, jclass, jlong, jlong, jint);
-
-JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1retrieve__JJI
- (JNIEnv* env, jclass c, jlong self, jlong image_nativeObj, jint channel)
-{
- return Java_org_opencv_highgui_VideoCapture_retrieve_10(env, c, self, image_nativeObj, channel);
-}
-
-
-JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1retrieve__JJ
- (JNIEnv* env, jclass c, jlong self, jlong image_nativeObj);
-
-JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_retrieve_11 (JNIEnv*, jclass, jlong, jlong);
-
-JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1retrieve__JJ
- (JNIEnv* env, jclass c, jlong self, jlong image_nativeObj)
-{
- return Java_org_opencv_highgui_VideoCapture_retrieve_11(env, c, self, image_nativeObj);
-}
-
-
-JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1set
- (JNIEnv* env, jclass c, jlong self, jint propId, jdouble value);
-
-JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_set_10 (JNIEnv*, jclass, jlong, jint, jdouble);
-
-JNIEXPORT jboolean JNICALL Java_org_opencv_highgui_VideoCapture_n_1set
- (JNIEnv* env, jclass c, jlong self, jint propId, jdouble value)
-{
- return Java_org_opencv_highgui_VideoCapture_set_10(env, c, self, propId, value);
-}
-
-
-JNIEXPORT jstring JNICALL Java_org_opencv_highgui_VideoCapture_n_1getSupportedPreviewSizes
- (JNIEnv *env, jclass c, jlong self);
-
-JNIEXPORT jstring JNICALL Java_org_opencv_highgui_VideoCapture_getSupportedPreviewSizes_10
- (JNIEnv *env, jclass, jlong self);
-
-JNIEXPORT jstring JNICALL Java_org_opencv_highgui_VideoCapture_n_1getSupportedPreviewSizes
- (JNIEnv *env, jclass c, jlong self)
-{
- return Java_org_opencv_highgui_VideoCapture_getSupportedPreviewSizes_10(env, c, self);
-}
-
-
-JNIEXPORT void JNICALL Java_org_opencv_highgui_VideoCapture_n_1delete
- (JNIEnv *env, jclass c, jlong self);
-
-JNIEXPORT void JNICALL Java_org_opencv_highgui_VideoCapture_delete(JNIEnv*, jclass, jlong);
-
-JNIEXPORT void JNICALL Java_org_opencv_highgui_VideoCapture_n_1delete
- (JNIEnv *env, jclass c, jlong self)
-{
- Java_org_opencv_highgui_VideoCapture_delete(env, c, self);
-}
-
-
-} // extern "C"
-#endif // (CV_VERSION_EPOCH == 2) && (CV_VERSION_MAJOR == 4)
-#endif // HAVE_OPENCV_HIGHGUI
--- /dev/null
+package org.opencv.highgui;
+
+import java.util.List;
+import java.util.LinkedList;
+
+import org.opencv.core.Mat;
+import org.opencv.core.Size;
+
+// C++: class VideoCapture
+//javadoc: VideoCapture
+public class VideoCapture {
+
+ protected final long nativeObj;
+
+ protected VideoCapture(long addr) {
+ nativeObj = addr;
+ }
+
+ //
+ // C++: VideoCapture::VideoCapture()
+ //
+
+ // javadoc: VideoCapture::VideoCapture()
+ public VideoCapture()
+ {
+
+ nativeObj = n_VideoCapture();
+
+ return;
+ }
+
+ //
+ // C++: VideoCapture::VideoCapture(int device)
+ //
+
+ // javadoc: VideoCapture::VideoCapture(device)
+ public VideoCapture(int device)
+ {
+
+ nativeObj = n_VideoCapture(device);
+
+ return;
+ }
+
+ //
+ // C++: double VideoCapture::get(int propId)
+ //
+
+/**
+ * Returns the specified "VideoCapture" property.
+ *
+ * Note: When querying a property that is not supported by the backend used by
+ * the "VideoCapture" class, value 0 is returned.
+ *
+ * @param propId property identifier; it can be one of the following:
+ * * CV_CAP_PROP_FRAME_WIDTH width of the frames in the video stream.
+ * * CV_CAP_PROP_FRAME_HEIGHT height of the frames in the video stream.
+ *
+ * @see <a href="http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images_and_video.html#videocapture-get">org.opencv.highgui.VideoCapture.get</a>
+ */
+ public double get(int propId)
+ {
+
+ double retVal = n_get(nativeObj, propId);
+
+ return retVal;
+ }
+
+ public List<Size> getSupportedPreviewSizes()
+ {
+ String[] sizes_str = n_getSupportedPreviewSizes(nativeObj).split(",");
+ List<Size> sizes = new LinkedList<Size>();
+
+ for (String str : sizes_str) {
+ String[] wh = str.split("x");
+ sizes.add(new Size(Double.parseDouble(wh[0]), Double.parseDouble(wh[1])));
+ }
+
+ return sizes;
+ }
+
+ //
+ // C++: bool VideoCapture::grab()
+ //
+
+ // javadoc: VideoCapture::grab()
+ public boolean grab()
+ {
+
+ boolean retVal = n_grab(nativeObj);
+
+ return retVal;
+ }
+
+ //
+ // C++: bool VideoCapture::isOpened()
+ //
+
+ // javadoc: VideoCapture::isOpened()
+ public boolean isOpened()
+ {
+
+ boolean retVal = n_isOpened(nativeObj);
+
+ return retVal;
+ }
+
+ //
+ // C++: bool VideoCapture::open(int device)
+ //
+
+ // javadoc: VideoCapture::open(device)
+ public boolean open(int device)
+ {
+
+ boolean retVal = n_open(nativeObj, device);
+
+ return retVal;
+ }
+
+ //
+ // C++: bool VideoCapture::read(Mat image)
+ //
+
+ // javadoc: VideoCapture::read(image)
+ public boolean read(Mat image)
+ {
+
+ boolean retVal = n_read(nativeObj, image.nativeObj);
+
+ return retVal;
+ }
+
+ //
+ // C++: void VideoCapture::release()
+ //
+
+ // javadoc: VideoCapture::release()
+ public void release()
+ {
+
+ n_release(nativeObj);
+
+ return;
+ }
+
+ //
+ // C++: bool VideoCapture::retrieve(Mat image, int channel = 0)
+ //
+
+ // javadoc: VideoCapture::retrieve(image, channel)
+ public boolean retrieve(Mat image, int channel)
+ {
+
+ boolean retVal = n_retrieve(nativeObj, image.nativeObj, channel);
+
+ return retVal;
+ }
+
+ // javadoc: VideoCapture::retrieve(image)
+ public boolean retrieve(Mat image)
+ {
+
+ boolean retVal = n_retrieve(nativeObj, image.nativeObj);
+
+ return retVal;
+ }
+
+ //
+ // C++: bool VideoCapture::set(int propId, double value)
+ //
+
+/**
+ * Sets a property in the "VideoCapture".
+ *
+ * @param propId property identifier; it can be one of the following:
+ * * CV_CAP_PROP_FRAME_WIDTH width of the frames in the video stream.
+ * * CV_CAP_PROP_FRAME_HEIGHT height of the frames in the video stream.
+ * @param value value of the property.
+ *
+ * @see <a href="http://docs.opencv.org/modules/highgui/doc/reading_and_writing_images_and_video.html#videocapture-set">org.opencv.highgui.VideoCapture.set</a>
+ */
+ public boolean set(int propId, double value)
+ {
+
+ boolean retVal = n_set(nativeObj, propId, value);
+
+ return retVal;
+ }
+
+ @Override
+ protected void finalize() throws Throwable {
+ n_delete(nativeObj);
+ super.finalize();
+ }
+
+ // C++: VideoCapture::VideoCapture()
+ private static native long n_VideoCapture();
+
+ // C++: VideoCapture::VideoCapture(string filename)
+ private static native long n_VideoCapture(java.lang.String filename);
+
+ // C++: VideoCapture::VideoCapture(int device)
+ private static native long n_VideoCapture(int device);
+
+ // C++: double VideoCapture::get(int propId)
+ private static native double n_get(long nativeObj, int propId);
+
+ // C++: bool VideoCapture::grab()
+ private static native boolean n_grab(long nativeObj);
+
+ // C++: bool VideoCapture::isOpened()
+ private static native boolean n_isOpened(long nativeObj);
+
+ // C++: bool VideoCapture::open(string filename)
+ private static native boolean n_open(long nativeObj, java.lang.String filename);
+
+ // C++: bool VideoCapture::open(int device)
+ private static native boolean n_open(long nativeObj, int device);
+
+ // C++: bool VideoCapture::read(Mat image)
+ private static native boolean n_read(long nativeObj, long image_nativeObj);
+
+ // C++: void VideoCapture::release()
+ private static native void n_release(long nativeObj);
+
+ // C++: bool VideoCapture::retrieve(Mat image, int channel = 0)
+ private static native boolean n_retrieve(long nativeObj, long image_nativeObj, int channel);
+
+ private static native boolean n_retrieve(long nativeObj, long image_nativeObj);
+
+ // C++: bool VideoCapture::set(int propId, double value)
+ private static native boolean n_set(long nativeObj, int propId, double value);
+
+ private static native String n_getSupportedPreviewSizes(long nativeObj);
+
+ // native support for java finalize()
+ private static native void n_delete(long nativeObj);
+
+}
bool haveDoubleSupport;
bool isUnifiedMemory; // 1 means integrated GPU, otherwise this value is 0
+ bool isIntelDevice;
std::string compilationExtraOptions;
{
FEATURE_CL_DOUBLE = 1,
FEATURE_CL_UNIFIED_MEM,
- FEATURE_CL_VER_1_2
+ FEATURE_CL_VER_1_2,
+ FEATURE_CL_INTEL_DEVICE
};
// Represents OpenCL context, interface
const cv::ocl::ProgramEntry* source, std::string kernelName);
CV_EXPORTS cl_kernel openCLGetKernelFromSource(const Context *clCxt,
const cv::ocl::ProgramEntry* source, std::string kernelName, const char *build_options);
+CV_EXPORTS cl_kernel openCLGetKernelFromSource(Context *ctx, const cv::ocl::ProgramEntry* source,
+ string kernelName, int channels, int depth, const char *build_options);
CV_EXPORTS void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads);
+CV_EXPORTS void openCLExecuteKernel(Context *ctx, cl_kernel kernel, size_t globalThreads[3],
+ size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args);
CV_EXPORTS void openCLExecuteKernel(Context *clCxt , const cv::ocl::ProgramEntry* source, string kernelName, std::vector< std::pair<size_t, const void *> > &args,
int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1);
CV_EXPORTS void openCLExecuteKernel_(Context *clCxt, const cv::ocl::ProgramEntry* source, std::string kernelName,
OCL_PERF_ELSE
}
+#if 0
+
/////////////////// upload ///////////////////////////
typedef tuple<Size, MatDepth, int> uploadParams;
SANITY_CHECK_NOTHING();
}
+
+#endif
break;
}
if (isRelative)
- r = r / norm(src2, normType);
+ r = r / (norm(src2, normType) + DBL_EPSILON);
return r;
}
{
deviceInfo.info.haveDoubleSupport = false;
}
+
+ size_t intel_platform = platformInfo.info.platformVendor.find("Intel");
+ if(intel_platform != std::string::npos)
+ {
+ deviceInfo.info.compilationExtraOptions += " -D INTEL_DEVICE";
+ deviceInfo.info.isIntelDevice = true;
+ }
+ else
+ {
+ deviceInfo.info.isIntelDevice = false;
+ }
}
}
}
deviceVendorId(-1),
maxWorkGroupSize(0), maxComputeUnits(0), localMemorySize(0), maxMemAllocSize(0),
deviceVersionMajor(0), deviceVersionMinor(0),
- haveDoubleSupport(false), isUnifiedMemory(false),
+ haveDoubleSupport(false), isUnifiedMemory(false),isIntelDevice(false),
platform(NULL)
{
// nothing
{
switch (featureType)
{
+ case FEATURE_CL_INTEL_DEVICE:
+ return deviceInfo.isIntelDevice;
case FEATURE_CL_DOUBLE:
return deviceInfo.haveDoubleSupport;
case FEATURE_CL_UNIFIED_MEM:
return opt;
}
-void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName, size_t globalThreads[3],
- size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels,
+cl_kernel openCLGetKernelFromSource(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName, int channels,
int depth, const char *build_options)
{
//construct kernel name
idxStr << "_D" << depth;
kernelName += idxStr.str();
- cl_kernel kernel;
std::string fixedOptions = removeDuplicatedWhiteSpaces(build_options);
- kernel = openCLGetKernelFromSource(ctx, source, kernelName, fixedOptions.c_str());
+ cl_kernel kernel = openCLGetKernelFromSource(ctx, source, kernelName, fixedOptions.c_str());
+ return kernel;
+}
+void openCLExecuteKernel(Context *ctx, cl_kernel kernel, size_t globalThreads[3],
+ size_t localThreads[3], vector< pair<size_t, const void *> > &args)
+{
if ( localThreads != NULL)
{
globalThreads[0] = roundUp(globalThreads[0], localThreads[0]);
openCLSafeCall(clReleaseKernel(kernel));
}
+void openCLExecuteKernel_(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName, size_t globalThreads[3],
+ size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels,
+ int depth, const char *build_options)
+{
+ cl_kernel kernel = openCLGetKernelFromSource(ctx, source, kernelName, channels, depth, build_options);
+
+ openCLExecuteKernel(ctx, kernel, globalThreads, localThreads, args);
+}
+
void openCLExecuteKernel(Context *ctx, const cv::ocl::ProgramEntry* source, string kernelName,
size_t globalThreads[3], size_t localThreads[3],
vector< pair<size_t, const void *> > &args, int channels, int depth)
if(status != CL_SUCCESS)
{
- if(status == CL_BUILD_PROGRAM_FAILURE)
+ if (status == CL_BUILD_PROGRAM_FAILURE || status == CL_INVALID_BUILD_OPTIONS)
{
size_t buildLogSize = 0;
openCLSafeCall(clGetProgramBuildInfo(program, getClDeviceID(ctx),
func(src, dst, kernel, ksize, anchor, borderType) ;
}
- oclMat kernel;
+ Mat kernel;
GPUFilter2D_t func;
};
}
kernelDataFloat.size()*sizeof(float), 1, clMemcpyHostToDevice);
}
- size_t BLOCK_SIZE = src.clCxt->getDeviceInfo().maxWorkItemSizes[0];
+ size_t tryWorkItems = src.clCxt->getDeviceInfo().maxWorkItemSizes[0];
+ do {
+ size_t BLOCK_SIZE = tryWorkItems;
+ while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)src.cols * 2)
+ BLOCK_SIZE /= 2;
#if 1 // TODO Mode with several blocks requires a much more VGPRs, so this optimization is not actual for the current devices
- size_t BLOCK_SIZE_Y = 1;
+ size_t BLOCK_SIZE_Y = 1;
#else
- size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices
- while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows)
- BLOCK_SIZE_Y *= 2;
+ size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices
+ while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows)
+ BLOCK_SIZE_Y *= 2;
#endif
- CV_Assert((size_t)ksize.width <= BLOCK_SIZE);
+ CV_Assert((size_t)ksize.width <= BLOCK_SIZE);
- bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0;
+ bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0;
- vector<pair<size_t , const void *> > args;
+ vector<pair<size_t , const void *> > args;
- args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
- cl_uint stepBytes = src.step;
- args.push_back( make_pair( sizeof(cl_uint), (void *)&stepBytes));
- int offsetXBytes = src.offset % src.step;
- int offsetX = offsetXBytes / src.elemSize();
- CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes);
- int offsetY = src.offset / src.step;
- int endX = (offsetX + src.cols);
- int endY = (offsetY + src.rows);
- cl_int rect[4] = {offsetX, offsetY, endX, endY};
- if (!isIsolatedBorder)
- {
- rect[2] = src.wholecols;
- rect[3] = src.wholerows;
- }
- args.push_back( make_pair( sizeof(cl_int)*4, (void *)&rect[0]));
-
- args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
- cl_uint _stepBytes = dst.step;
- args.push_back( make_pair( sizeof(cl_uint), (void *)&_stepBytes));
- int _offsetXBytes = dst.offset % dst.step;
- int _offsetX = _offsetXBytes / dst.elemSize();
- CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes);
- int _offsetY = dst.offset / dst.step;
- int _endX = (_offsetX + dst.cols);
- int _endY = (_offsetY + dst.rows);
- cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY};
- args.push_back( make_pair( sizeof(cl_int)*4, (void *)&_rect[0]));
-
- float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
- double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
- if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT)
- {
- if (useDouble)
- args.push_back( make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0]));
- else
- args.push_back( make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0]));
- }
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
+ cl_uint stepBytes = src.step;
+ args.push_back( make_pair( sizeof(cl_uint), (void *)&stepBytes));
+ int offsetXBytes = src.offset % src.step;
+ int offsetX = offsetXBytes / src.elemSize();
+ CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes);
+ int offsetY = src.offset / src.step;
+ int endX = (offsetX + src.cols);
+ int endY = (offsetY + src.rows);
+ cl_int rect[4] = {offsetX, offsetY, endX, endY};
+ if (!isIsolatedBorder)
+ {
+ rect[2] = src.wholecols;
+ rect[3] = src.wholerows;
+ }
+ args.push_back( make_pair( sizeof(cl_int)*4, (void *)&rect[0]));
+
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
+ cl_uint _stepBytes = dst.step;
+ args.push_back( make_pair( sizeof(cl_uint), (void *)&_stepBytes));
+ int _offsetXBytes = dst.offset % dst.step;
+ int _offsetX = _offsetXBytes / dst.elemSize();
+ CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes);
+ int _offsetY = dst.offset / dst.step;
+ int _endX = (_offsetX + dst.cols);
+ int _endY = (_offsetY + dst.rows);
+ cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY};
+ args.push_back( make_pair( sizeof(cl_int)*4, (void *)&_rect[0]));
+
+ float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
+ double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
+ if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT)
+ {
+ if (useDouble)
+ args.push_back( make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0]));
+ else
+ args.push_back( make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0]));
+ }
- args.push_back( make_pair( sizeof(cl_mem), (void *)&oclKernelParameter.data));
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&oclKernelParameter.data));
- const char* btype = NULL;
+ const char* btype = NULL;
- switch (borderType & ~BORDER_ISOLATED)
- {
- case BORDER_CONSTANT:
- btype = "BORDER_CONSTANT";
- break;
- case BORDER_REPLICATE:
- btype = "BORDER_REPLICATE";
- break;
- case BORDER_REFLECT:
- btype = "BORDER_REFLECT";
- break;
- case BORDER_WRAP:
- CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!");
- return;
- case BORDER_REFLECT101:
- btype = "BORDER_REFLECT_101";
- break;
- }
+ switch (borderType & ~BORDER_ISOLATED)
+ {
+ case BORDER_CONSTANT:
+ btype = "BORDER_CONSTANT";
+ break;
+ case BORDER_REPLICATE:
+ btype = "BORDER_REPLICATE";
+ break;
+ case BORDER_REFLECT:
+ btype = "BORDER_REFLECT";
+ break;
+ case BORDER_WRAP:
+ CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!");
+ return;
+ case BORDER_REFLECT101:
+ btype = "BORDER_REFLECT_101";
+ break;
+ }
+
+ int requiredTop = anchor.y;
+ int requiredLeft = BLOCK_SIZE; // not this: anchor.x;
+ int requiredBottom = ksize.height - 1 - anchor.y;
+ int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x;
+ int h = isIsolatedBorder ? src.rows : src.wholerows;
+ int w = isIsolatedBorder ? src.cols : src.wholecols;
+ bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight;
+
+ char build_options[1024];
+ sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d "
+ "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D KERNEL_SIZE_Y2_ALIGNED=%d "
+ "-D %s -D %s -D %s",
+ (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y,
+ src.depth(), src.oclchannels(), useDouble ? 1 : 0,
+ anchor.x, anchor.y, ksize.width, ksize.height, kernel_size_y2_aligned,
+ btype,
+ extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
+ isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
+
+ size_t lt[3] = {BLOCK_SIZE, 1, 1};
+ size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1};
+
+ cl_kernel kernel = openCLGetKernelFromSource(src.clCxt, &filtering_filter2D, "filter2D", -1, -1, build_options);
+
+ size_t kernelWorkGroupSize;
+ openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(src.clCxt),
+ CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
+ if (lt[0] > kernelWorkGroupSize)
+ {
+ clReleaseKernel(kernel);
+ CV_Assert(BLOCK_SIZE > kernelWorkGroupSize);
+ tryWorkItems = kernelWorkGroupSize;
+ continue;
+ }
- int requiredTop = anchor.y;
- int requiredLeft = BLOCK_SIZE; // not this: anchor.x;
- int requiredBottom = ksize.height - 1 - anchor.y;
- int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x;
- int h = isIsolatedBorder ? src.rows : src.wholerows;
- int w = isIsolatedBorder ? src.cols : src.wholecols;
- bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight;
-
- char build_options[1024];
- sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d "
- "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D KERNEL_SIZE_Y2_ALIGNED=%d "
- "-D %s -D %s -D %s",
- (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y,
- src.depth(), src.oclchannels(), useDouble ? 1 : 0,
- anchor.x, anchor.y, ksize.width, ksize.height, kernel_size_y2_aligned,
- btype,
- extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
- isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
-
- size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1}, lt[3] = {BLOCK_SIZE, 1, 1};
- openCLExecuteKernel(src.clCxt, &filtering_filter2D, "filter2D", gt, lt, args, -1, -1, build_options);
+ openCLExecuteKernel(src.clCxt, kernel, gt, lt, args); // kernel will be released here
+ } while (false);
}
Ptr<BaseFilter_GPU> cv::ocl::getLinearFilter_GPU(int /*srcType*/, int /*dstType*/, const Mat &kernel, const Size &ksize,
(src.rows == dst.rows));
CV_Assert(src.oclchannels() == dst.oclchannels());
- size_t BLOCK_SIZE = src.clCxt->getDeviceInfo().maxWorkItemSizes[0];
- size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices
- while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows)
- BLOCK_SIZE_Y *= 2;
-
- CV_Assert((size_t)ksize.width <= BLOCK_SIZE);
-
- bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0;
-
- vector<pair<size_t , const void *> > args;
-
- args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
- cl_uint stepBytes = src.step;
- args.push_back( make_pair( sizeof(cl_uint), (void *)&stepBytes));
- int offsetXBytes = src.offset % src.step;
- int offsetX = offsetXBytes / src.elemSize();
- CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes);
- int offsetY = src.offset / src.step;
- int endX = (offsetX + src.cols);
- int endY = (offsetY + src.rows);
- cl_int rect[4] = {offsetX, offsetY, endX, endY};
- if (!isIsolatedBorder)
- {
- rect[2] = src.wholecols;
- rect[3] = src.wholerows;
- }
- args.push_back( make_pair( sizeof(cl_int)*4, (void *)&rect[0]));
-
- args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
- cl_uint _stepBytes = dst.step;
- args.push_back( make_pair( sizeof(cl_uint), (void *)&_stepBytes));
- int _offsetXBytes = dst.offset % dst.step;
- int _offsetX = _offsetXBytes / dst.elemSize();
- CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes);
- int _offsetY = dst.offset / dst.step;
- int _endX = (_offsetX + dst.cols);
- int _endY = (_offsetY + dst.rows);
- cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY};
- args.push_back( make_pair( sizeof(cl_int)*4, (void *)&_rect[0]));
-
- bool useDouble = src.depth() == CV_64F;
+ size_t tryWorkItems = src.clCxt->getDeviceInfo().maxWorkItemSizes[0];
+ do {
+ size_t BLOCK_SIZE = tryWorkItems;
+ while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)src.cols * 2)
+ BLOCK_SIZE /= 2;
+ size_t BLOCK_SIZE_Y = 8; // TODO Check heuristic value on devices
+ while (BLOCK_SIZE_Y < BLOCK_SIZE / 8 && BLOCK_SIZE_Y * src.clCxt->getDeviceInfo().maxComputeUnits * 32 < (size_t)src.rows)
+ BLOCK_SIZE_Y *= 2;
+
+ CV_Assert((size_t)ksize.width <= BLOCK_SIZE);
+
+ bool isIsolatedBorder = (borderType & BORDER_ISOLATED) != 0;
+
+ vector<pair<size_t , const void *> > args;
+
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
+ cl_uint stepBytes = src.step;
+ args.push_back( make_pair( sizeof(cl_uint), (void *)&stepBytes));
+ int offsetXBytes = src.offset % src.step;
+ int offsetX = offsetXBytes / src.elemSize();
+ CV_Assert((int)(offsetX * src.elemSize()) == offsetXBytes);
+ int offsetY = src.offset / src.step;
+ int endX = (offsetX + src.cols);
+ int endY = (offsetY + src.rows);
+ cl_int rect[4] = {offsetX, offsetY, endX, endY};
+ if (!isIsolatedBorder)
+ {
+ rect[2] = src.wholecols;
+ rect[3] = src.wholerows;
+ }
+ args.push_back( make_pair( sizeof(cl_int)*4, (void *)&rect[0]));
+
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
+ cl_uint _stepBytes = dst.step;
+ args.push_back( make_pair( sizeof(cl_uint), (void *)&_stepBytes));
+ int _offsetXBytes = dst.offset % dst.step;
+ int _offsetX = _offsetXBytes / dst.elemSize();
+ CV_Assert((int)(_offsetX * dst.elemSize()) == _offsetXBytes);
+ int _offsetY = dst.offset / dst.step;
+ int _endX = (_offsetX + dst.cols);
+ int _endY = (_offsetY + dst.rows);
+ cl_int _rect[4] = {_offsetX, _offsetY, _endX, _endY};
+ args.push_back( make_pair( sizeof(cl_int)*4, (void *)&_rect[0]));
+
+ bool useDouble = src.depth() == CV_64F;
+
+ float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
+ double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
+ if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT)
+ {
+ if (useDouble)
+ args.push_back( make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0]));
+ else
+ args.push_back( make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0]));
+ }
- float borderValue[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
- double borderValueDouble[4] = {0, 0, 0, 0}; // DON'T move into 'if' body
- if ((borderType & ~BORDER_ISOLATED) == BORDER_CONSTANT)
- {
+ double alphaDouble = alpha; // DON'T move into 'if' body
if (useDouble)
- args.push_back( make_pair( sizeof(double) * src.oclchannels(), (void *)&borderValue[0]));
+ args.push_back( make_pair( sizeof(double), (void *)&alphaDouble));
else
- args.push_back( make_pair( sizeof(float) * src.oclchannels(), (void *)&borderValueDouble[0]));
- }
+ args.push_back( make_pair( sizeof(float), (void *)&alpha));
- double alphaDouble = alpha; // DON'T move into 'if' body
- if (useDouble)
- args.push_back( make_pair( sizeof(double), (void *)&alphaDouble));
- else
- args.push_back( make_pair( sizeof(float), (void *)&alpha));
+ const char* btype = NULL;
- const char* btype = NULL;
+ switch (borderType & ~BORDER_ISOLATED)
+ {
+ case BORDER_CONSTANT:
+ btype = "BORDER_CONSTANT";
+ break;
+ case BORDER_REPLICATE:
+ btype = "BORDER_REPLICATE";
+ break;
+ case BORDER_REFLECT:
+ btype = "BORDER_REFLECT";
+ break;
+ case BORDER_WRAP:
+ CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!");
+ return;
+ case BORDER_REFLECT101:
+ btype = "BORDER_REFLECT_101";
+ break;
+ }
- switch (borderType & ~BORDER_ISOLATED)
- {
- case BORDER_CONSTANT:
- btype = "BORDER_CONSTANT";
- break;
- case BORDER_REPLICATE:
- btype = "BORDER_REPLICATE";
- break;
- case BORDER_REFLECT:
- btype = "BORDER_REFLECT";
- break;
- case BORDER_WRAP:
- CV_Error(CV_StsUnsupportedFormat, "BORDER_WRAP is not supported!");
- return;
- case BORDER_REFLECT101:
- btype = "BORDER_REFLECT_101";
- break;
- }
+ int requiredTop = anchor.y;
+ int requiredLeft = BLOCK_SIZE; // not this: anchor.x;
+ int requiredBottom = ksize.height - 1 - anchor.y;
+ int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x;
+ int h = isIsolatedBorder ? src.rows : src.wholerows;
+ int w = isIsolatedBorder ? src.cols : src.wholecols;
+ bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight;
+
+ CV_Assert(w >= ksize.width && h >= ksize.height); // TODO Other cases are not tested well
+
+ char build_options[1024];
+ sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s -D %s -D %s",
+ (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y,
+ src.depth(), src.oclchannels(), useDouble ? 1 : 0,
+ anchor.x, anchor.y, ksize.width, ksize.height,
+ btype,
+ extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
+ isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
+
+ size_t lt[3] = {BLOCK_SIZE, 1, 1};
+ size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1};
+
+ cl_kernel kernel = openCLGetKernelFromSource(src.clCxt, &filtering_boxFilter, "boxFilter", -1, -1, build_options);
+
+ size_t kernelWorkGroupSize;
+ openCLSafeCall(clGetKernelWorkGroupInfo(kernel, getClDeviceID(src.clCxt),
+ CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
+ if (lt[0] > kernelWorkGroupSize)
+ {
+ clReleaseKernel(kernel);
+ CV_Assert(BLOCK_SIZE > kernelWorkGroupSize);
+ tryWorkItems = kernelWorkGroupSize;
+ continue;
+ }
- int requiredTop = anchor.y;
- int requiredLeft = BLOCK_SIZE; // not this: anchor.x;
- int requiredBottom = ksize.height - 1 - anchor.y;
- int requiredRight = BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x;
- int h = isIsolatedBorder ? src.rows : src.wholerows;
- int w = isIsolatedBorder ? src.cols : src.wholecols;
- bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight;
-
- CV_Assert(w >= ksize.width && h >= ksize.height); // TODO Other cases are not tested well
-
- char build_options[1024];
- sprintf(build_options, "-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d -D USE_DOUBLE=%d -D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d -D %s -D %s -D %s",
- (int)BLOCK_SIZE, (int)BLOCK_SIZE_Y,
- src.depth(), src.oclchannels(), useDouble ? 1 : 0,
- anchor.x, anchor.y, ksize.width, ksize.height,
- btype,
- extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
- isIsolatedBorder ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED");
-
- size_t gt[3] = {divUp(dst.cols, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE, divUp(dst.rows, BLOCK_SIZE_Y), 1}, lt[3] = {BLOCK_SIZE, 1, 1};
- openCLExecuteKernel(src.clCxt, &filtering_boxFilter, "boxFilter", gt, lt, args, -1, -1, build_options);
+ openCLExecuteKernel(src.clCxt, kernel, gt, lt, args); // kernel will be released here
+ } while (false);
}
Ptr<BaseFilter_GPU> cv::ocl::getBoxFilter_GPU(int /*srcType*/, int /*dstType*/,
void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double sigma1, double sigma2, int bordertype)
{
+ if (bordertype != BORDER_CONSTANT)
+ {
+ if (src.rows == 1)
+ ksize.height = 1;
+
+ if (src.cols == 1)
+ ksize.width = 1;
+ }
+
if (ksize.width == 1 && ksize.height == 1)
{
src.copyTo(dst);
dst.create(src.size(), src.type());
- if (bordertype != BORDER_CONSTANT)
- {
- if (src.rows == 1)
- ksize.height = 1;
-
- if (src.cols == 1)
- ksize.width = 1;
- }
-
Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype);
f->apply(src, dst);
}
args.push_back ( make_pair(sizeof(cl_int4) , (void *)&pq ));
args.push_back ( make_pair(sizeof(cl_float) , (void *)&correction ));
- const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
+ if(gcascade->is_stump_based && gsum.clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE))
+ {
+ //setup local group size
+ localThreads[0] = 8;
+ localThreads[1] = 16;
+ localThreads[2] = 1;
+
+ //init maximal number of workgroups
+ int WGNumX = 1+(sizev[0].width /(localThreads[0]));
+ int WGNumY = 1+(sizev[0].height/(localThreads[1]));
+ int WGNumZ = loopcount;
+ int WGNum = 0; //accurate number of non -empty workgroups
+ oclMat oclWGInfo(1,sizeof(cl_int4) * WGNumX*WGNumY*WGNumZ,CV_8U);
+ {
+ cl_int4* pWGInfo = (cl_int4*)clEnqueueMapBuffer(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,true,CL_MAP_WRITE, 0, oclWGInfo.step, 0,0,0,&status);
+ openCLVerifyCall(status);
+ for(int z=0;z<WGNumZ;++z)
+ {
+ int Width = (scaleinfo[z].width_height >> 16)&0xFFFF;
+ int Height = (scaleinfo[z].width_height >> 0 )& 0xFFFF;
+ for(int y=0;y<WGNumY;++y)
+ {
+ int gy = y*localThreads[1];
+ if(gy>=(Height-cascade->orig_window_size.height))
+ continue; // no data to process
+ for(int x=0;x<WGNumX;++x)
+ {
+ int gx = x*localThreads[0];
+ if(gx>=(Width-cascade->orig_window_size.width))
+ continue; // no data to process
+
+ // save no-empty workgroup info into array
+ pWGInfo[WGNum].s[0] = scaleinfo[z].width_height;
+ pWGInfo[WGNum].s[1] = (gx << 16) | gy;
+ pWGInfo[WGNum].s[2] = scaleinfo[z].imgoff;
+ memcpy(&(pWGInfo[WGNum].s[3]),&(scaleinfo[z].factor),sizeof(float));
+ WGNum++;
+ }
+ }
+ }
+ openCLSafeCall(clEnqueueUnmapMemObject(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,pWGInfo,0,0,0));
+ pWGInfo = NULL;
+ }
- openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1, build_options);
+ // setup global sizes to have linear array of workgroups with WGNum size
+ globalThreads[0] = localThreads[0]*WGNum;
+ globalThreads[1] = localThreads[1];
+ globalThreads[2] = 1;
+
+#define NODE_SIZE 12
+ // pack node info to have less memory loads
+ oclMat oclNodesPK(1,sizeof(cl_int) * NODE_SIZE * nodenum,CV_8U);
+ {
+ cl_int status;
+ cl_int* pNodesPK = (cl_int*)clEnqueueMapBuffer(getClCommandQueue(oclNodesPK.clCxt),(cl_mem)oclNodesPK.datastart,true,CL_MAP_WRITE, 0, oclNodesPK.step, 0,0,0,&status);
+ openCLVerifyCall(status);
+ //use known local data stride to precalulate indexes
+ int DATA_SIZE_X = (localThreads[0]+cascade->orig_window_size.width);
+ // check that maximal value is less than maximal unsigned short
+ assert(DATA_SIZE_X*cascade->orig_window_size.height+cascade->orig_window_size.width < USHRT_MAX);
+ for(int i = 0;i<nodenum;++i)
+ {//process each node from classifier
+ struct NodePK
+ {
+ unsigned short slm_index[3][4];
+ float weight[3];
+ float threshold;
+ float alpha[2];
+ };
+ struct NodePK * pOut = (struct NodePK *)(pNodesPK + NODE_SIZE*i);
+ for(int k=0;k<3;++k)
+ {// calc 4 short indexes in shared local mem for each rectangle instead of 2 (x,y) pair.
+ int* p = &(node[i].p[k][0]);
+ pOut->slm_index[k][0] = (unsigned short)(p[1]*DATA_SIZE_X+p[0]);
+ pOut->slm_index[k][1] = (unsigned short)(p[1]*DATA_SIZE_X+p[2]);
+ pOut->slm_index[k][2] = (unsigned short)(p[3]*DATA_SIZE_X+p[0]);
+ pOut->slm_index[k][3] = (unsigned short)(p[3]*DATA_SIZE_X+p[2]);
+ }
+ //store used float point values for each node
+ pOut->weight[0] = node[i].weight[0];
+ pOut->weight[1] = node[i].weight[1];
+ pOut->weight[2] = node[i].weight[2];
+ pOut->threshold = node[i].threshold;
+ pOut->alpha[0] = node[i].alpha[0];
+ pOut->alpha[1] = node[i].alpha[1];
+ }
+ openCLSafeCall(clEnqueueUnmapMemObject(getClCommandQueue(oclNodesPK.clCxt),(cl_mem)oclNodesPK.datastart,pNodesPK,0,0,0));
+ pNodesPK = NULL;
+ }
+ // add 2 additional buffers (WGinfo and packed nodes) as 2 last args
+ args.push_back ( make_pair(sizeof(cl_mem) , (void *)&oclNodesPK.datastart ));
+ args.push_back ( make_pair(sizeof(cl_mem) , (void *)&oclWGInfo.datastart ));
+
+ //form build options for kernel
+ string options = "-D PACKED_CLASSIFIER";
+ options += format(" -D NODE_SIZE=%d",NODE_SIZE);
+ options += format(" -D WND_SIZE_X=%d",cascade->orig_window_size.width);
+ options += format(" -D WND_SIZE_Y=%d",cascade->orig_window_size.height);
+ options += format(" -D STUMP_BASED=%d",gcascade->is_stump_based);
+ options += format(" -D LSx=%d",localThreads[0]);
+ options += format(" -D LSy=%d",localThreads[1]);
+ options += format(" -D SPLITNODE=%d",splitnode);
+ options += format(" -D SPLITSTAGE=%d",splitstage);
+ options += format(" -D OUTPUTSZ=%d",outputsz);
+
+ // init candiate global count by 0
+ int pattern = 0;
+ openCLSafeCall(clEnqueueWriteBuffer(qu, candidatebuffer, 1, 0, 1 * sizeof(pattern),&pattern, 0, NULL, NULL));
+ // execute face detector
+ openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, localThreads, args, -1, -1, options.c_str());
+ //read candidate buffer back and put it into host list
+ openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
+ assert(candidate[0]<outputsz);
+ //printf("candidate[0]=%d\n",candidate[0]);
+ for(int i = 1; i <= candidate[0]; i++)
+ {
+ allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],candidate[4 * i + 2], candidate[4 * i + 3]));
+ }
+ }
+ else
+ {
+ const char * build_options = gcascade->is_stump_based ? "-D STUMP_BASED=1" : "-D STUMP_BASED=0";
- openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
+ openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade", globalThreads, localThreads, args, -1, -1, build_options);
- for(int i = 0; i < outputsz; i++)
- if(candidate[4 * i + 2] != 0)
- allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
- candidate[4 * i + 2], candidate[4 * i + 3]));
+ openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
+
+ for(int i = 0; i < outputsz; i++)
+ if(candidate[4 * i + 2] != 0)
+ allCandidates.push_back(Rect(candidate[4 * i], candidate[4 * i + 1],
+ candidate[4 * i + 2], candidate[4 * i + 3]));
+ }
free(scaleinfo);
free(candidate);
if (ksize > 0)
{
- Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType);
- Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType);
+ Context* clCxt = Context::getContext();
+ if(clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE) && src.type() == CV_8UC1 &&
+ src.cols % 8 == 0 && src.rows % 8 == 0 &&
+ ksize==3 &&
+ (borderType ==cv::BORDER_REFLECT ||
+ borderType == cv::BORDER_REPLICATE ||
+ borderType ==cv::BORDER_REFLECT101 ||
+ borderType ==cv::BORDER_WRAP))
+ {
+ Dx.create(src.size(), CV_32FC1);
+ Dy.create(src.size(), CV_32FC1);
+
+ const unsigned int block_x = 8;
+ const unsigned int block_y = 8;
+
+ unsigned int src_pitch = src.step;
+ unsigned int dst_pitch = Dx.cols;
+
+ float _scale = scale;
+
+ std::vector<std::pair<size_t , const void *> > args;
+ args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
+ args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dx.data ));
+ args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dy.data ));
+ args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
+ args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
+ args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch ));
+ args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&dst_pitch ));
+ args.push_back( std::make_pair( sizeof(cl_float) , (void *)&_scale ));
+ size_t gt2[3] = {src.cols, src.rows, 1}, lt2[3] = {block_x, block_y, 1};
+
+ string option = "-D BLK_X=8 -D BLK_Y=8";
+ switch(borderType)
+ {
+ case cv::BORDER_REPLICATE:
+ option += " -D BORDER_REPLICATE";
+ break;
+ case cv::BORDER_REFLECT:
+ option += " -D BORDER_REFLECT";
+ break;
+ case cv::BORDER_REFLECT101:
+ option += " -D BORDER_REFLECT101";
+ break;
+ case cv::BORDER_WRAP:
+ option += " -D BORDER_WRAP";
+ break;
+ }
+ openCLExecuteKernel(src.clCxt, &imgproc_sobel3, "sobel3", gt2, lt2, args, -1, -1, option.c_str() );
+ }
+ else
+ {
+ Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType);
+ Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType);
+ }
}
else
{
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step));
args.push_back( make_pair( sizeof(cl_float) , (void *)&k));
+
openCLExecuteKernel(dst.clCxt, source, kernelName, gt, lt, args, -1, -1, buildOptions.c_str());
}
{
if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
{
- CV_Error(CV_OpenCLDoubleNotSupported, "Select device doesn't support double");
+ CV_Error(CV_OpenCLDoubleNotSupported, "Selected device doesn't support double");
return;
}
- CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE
|| borderType == cv::BORDER_REFLECT);
+
extractCovData(src, dx, dy, blockSize, ksize, borderType);
- dst.create(src.size(), CV_32F);
+ dst.create(src.size(), CV_32FC1);
corner_ocl(&imgproc_calcHarris, "calcHarris", blockSize, static_cast<float>(k), dx, dy, dst, borderType);
}
{
if (!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.depth() == CV_64F)
{
- CV_Error(CV_OpenCLDoubleNotSupported, "select device don't support double");
+ CV_Error(CV_OpenCLDoubleNotSupported, "Selected device doesn't support double");
return;
}
- CV_Assert(src.cols >= blockSize / 2 && src.rows >= blockSize / 2);
- CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 || borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
+ CV_Assert(borderType == cv::BORDER_CONSTANT || borderType == cv::BORDER_REFLECT101 ||
+ borderType == cv::BORDER_REPLICATE || borderType == cv::BORDER_REFLECT);
+
extractCovData(src, dx, dy, blockSize, ksize, borderType);
dst.create(src.size(), CV_32F);
float inv_window_area __attribute__((aligned (4)));
} GpuHidHaarClassifierCascade;
+
+#ifdef PACKED_CLASSIFIER
+// this code is scalar, one pixel -> one workitem
+__kernel void gpuRunHaarClassifierCascadePacked(
+ global const GpuHidHaarStageClassifier * stagecascadeptr,
+ global const int4 * info,
+ global const GpuHidHaarTreeNode * nodeptr,
+ global const int * restrict sum,
+ global const float * restrict sqsum,
+ volatile global int4 * candidate,
+ const int pixelstep,
+ const int loopcount,
+ const int start_stage,
+ const int split_stage,
+ const int end_stage,
+ const int startnode,
+ const int splitnode,
+ const int4 p,
+ const int4 pq,
+ const float correction,
+ global const int* pNodesPK,
+ global const int4* pWGInfo
+ )
+
+{
+// this version used information provided for each workgroup
+// no empty WG
+ int gid = (int)get_group_id(0);
+ int lid_x = (int)get_local_id(0);
+ int lid_y = (int)get_local_id(1);
+ int lid = lid_y*LSx+lid_x;
+ int4 WGInfo = pWGInfo[gid];
+ int GroupX = (WGInfo.y >> 16)&0xFFFF;
+ int GroupY = (WGInfo.y >> 0 )& 0xFFFF;
+ int Width = (WGInfo.x >> 16)&0xFFFF;
+ int Height = (WGInfo.x >> 0 )& 0xFFFF;
+ int ImgOffset = WGInfo.z;
+ float ScaleFactor = as_float(WGInfo.w);
+
+#define DATA_SIZE_X (LSx+WND_SIZE_X)
+#define DATA_SIZE_Y (LSy+WND_SIZE_Y)
+#define DATA_SIZE (DATA_SIZE_X*DATA_SIZE_Y)
+
+ local int SumL[DATA_SIZE];
+
+ // read input data window into local mem
+ for(int i = 0; i<DATA_SIZE; i+=(LSx*LSy))
+ {
+ int index = i+lid; // index in shared local memory
+ if(index<DATA_SIZE)
+ {// calc global x,y coordinat and read data from there
+ int x = min(GroupX + (index % (DATA_SIZE_X)),Width-1);
+ int y = min(GroupY + (index / (DATA_SIZE_X)),Height-1);
+ SumL[index] = sum[ImgOffset+y*pixelstep+x];
+ }
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // calc variance_norm_factor for all stages
+ float variance_norm_factor;
+ int nodecounter= startnode;
+ int4 info1 = p;
+ int4 info2 = pq;
+
+ {
+ int xl = lid_x;
+ int yl = lid_y;
+ int OffsetLocal = yl * DATA_SIZE_X + xl;
+ int OffsetGlobal = (GroupY+yl)* pixelstep + (GroupX+xl);
+
+ // add shift to get position on scaled image
+ OffsetGlobal += ImgOffset;
+
+ float mean =
+ SumL[info1.y*DATA_SIZE_X+info1.x+OffsetLocal] -
+ SumL[info1.y*DATA_SIZE_X+info1.z+OffsetLocal] -
+ SumL[info1.w*DATA_SIZE_X+info1.x+OffsetLocal] +
+ SumL[info1.w*DATA_SIZE_X+info1.z+OffsetLocal];
+ float sq =
+ sqsum[info2.y*pixelstep+info2.x+OffsetGlobal] -
+ sqsum[info2.y*pixelstep+info2.z+OffsetGlobal] -
+ sqsum[info2.w*pixelstep+info2.x+OffsetGlobal] +
+ sqsum[info2.w*pixelstep+info2.z+OffsetGlobal];
+
+ mean *= correction;
+ sq *= correction;
+
+ variance_norm_factor = sq - mean * mean;
+ variance_norm_factor = (variance_norm_factor >=0.f) ? sqrt(variance_norm_factor) : 1.f;
+ }// end calc variance_norm_factor for all stages
+
+ int result = (1.0f>0.0f);
+ for(int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++ )
+ {// iterate until candidate is exist
+ float stage_sum = 0.0f;
+ int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop);
+ float stagethreshold = as_float(stageinfo.y);
+ int lcl_off = (lid_y*DATA_SIZE_X)+(lid_x);
+ for(int nodeloop = 0; nodeloop < stageinfo.x; nodecounter++,nodeloop++ )
+ {
+ // simple macro to extract shorts from int
+#define M0(_t) ((_t)&0xFFFF)
+#define M1(_t) (((_t)>>16)&0xFFFF)
+ // load packed node data from global memory (L3) into registers
+ global const int4* pN = (__global int4*)(pNodesPK+nodecounter*NODE_SIZE);
+ int4 n0 = pN[0];
+ int4 n1 = pN[1];
+ int4 n2 = pN[2];
+ float nodethreshold = as_float(n2.y) * variance_norm_factor;
+ // calc sum of intensity pixels according to node information
+ float classsum =
+ (SumL[M0(n0.x)+lcl_off] - SumL[M1(n0.x)+lcl_off] - SumL[M0(n0.y)+lcl_off] + SumL[M1(n0.y)+lcl_off]) * as_float(n1.z) +
+ (SumL[M0(n0.z)+lcl_off] - SumL[M1(n0.z)+lcl_off] - SumL[M0(n0.w)+lcl_off] + SumL[M1(n0.w)+lcl_off]) * as_float(n1.w) +
+ (SumL[M0(n1.x)+lcl_off] - SumL[M1(n1.x)+lcl_off] - SumL[M0(n1.y)+lcl_off] + SumL[M1(n1.y)+lcl_off]) * as_float(n2.x);
+ //accumulate stage responce
+ stage_sum += (classsum >= nodethreshold) ? as_float(n2.w) : as_float(n2.z);
+ }
+ result = (stage_sum >= stagethreshold);
+ }// next stage if needed
+
+ if(result)
+ {// all stages will be passed and there is a detected face on the tested position
+ int index = 1+atomic_inc((volatile global int*)candidate); //get index to write global data with face info
+ if(index<OUTPUTSZ)
+ {
+ int x = GroupX+lid_x;
+ int y = GroupY+lid_y;
+ int4 candidate_result;
+ candidate_result.x = convert_int_rtn(x*ScaleFactor);
+ candidate_result.y = convert_int_rtn(y*ScaleFactor);
+ candidate_result.z = convert_int_rtn(ScaleFactor*WND_SIZE_X);
+ candidate_result.w = convert_int_rtn(ScaleFactor*WND_SIZE_Y);
+ candidate[index] = candidate_result;
+ }
+ }
+}//end gpuRunHaarClassifierCascade
+#else
+
__kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCascade(
global GpuHidHaarStageClassifier * stagecascadeptr,
global int4 * info,
}//end for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx)
}//end for(int scalei = 0; scalei <loopcount; scalei++)
}
+#endif
//
//M*/
-#if defined (DOUBLE_SUPPORT)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////Macro for border type////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////
-#ifdef BORDER_REPLICATE
-//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
-#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
-#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
-#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
-#endif
+#ifdef BORDER_CONSTANT
+#elif defined BORDER_REPLICATE
+#define EXTRAPOLATE(x, maxV) \
+ { \
+ x = max(min(x, maxV - 1), 0); \
+ }
+#elif defined BORDER_WRAP
+#define EXTRAPOLATE(x, maxV) \
+ { \
+ if (x < 0) \
+ x -= ((x - maxV + 1) / maxV) * maxV; \
+ if (x >= maxV) \
+ x %= maxV; \
+ }
+#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT101)
+#define EXTRAPOLATE_(x, maxV, delta) \
+ { \
+ if (maxV == 1) \
+ x = 0; \
+ else \
+ do \
+ { \
+ if ( x < 0 ) \
+ x = -x - 1 + delta; \
+ else \
+ x = maxV - 1 - (x - maxV) - delta; \
+ } \
+ while (x >= maxV || x < 0); \
+ }
#ifdef BORDER_REFLECT
-//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
-#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))
-#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i))
-#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
-#endif
-
-#ifdef BORDER_REFLECT101
-//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba
-#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
-#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i))
-#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
+#define EXTRAPOLATE(x, maxV) EXTRAPOLATE_(x, maxV, 0)
+#else
+#define EXTRAPOLATE(x, maxV) EXTRAPOLATE_(x, maxV, 1)
#endif
-
-#ifdef BORDER_WRAP
-//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
-#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))
-#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i))
-#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
+#else
+#error No extrapolation method
#endif
#define THREADS 256
-#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2)
+
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////calcHarris////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////
-__kernel void calcHarris(__global const float *Dx,__global const float *Dy, __global float *dst,
- int dx_offset, int dx_whole_rows, int dx_whole_cols, int dx_step,
- int dy_offset, int dy_whole_rows, int dy_whole_cols, int dy_step,
- int dst_offset, int dst_rows, int dst_cols, int dst_step,
- float k)
+
+__kernel void calcHarris(__global const float *Dx, __global const float *Dy, __global float *dst,
+ int dx_offset, int dx_whole_rows, int dx_whole_cols, int dx_step,
+ int dy_offset, int dy_whole_rows, int dy_whole_cols, int dy_step,
+ int dst_offset, int dst_rows, int dst_cols, int dst_step, float k)
{
int col = get_local_id(0);
- const int gX = get_group_id(0);
- const int gY = get_group_id(1);
- const int glx = get_global_id(0);
- const int gly = get_global_id(1);
+ int gX = get_group_id(0);
+ int gY = get_group_id(1);
+ int glx = get_global_id(0);
+ int gly = get_global_id(1);
int dx_x_off = (dx_offset % dx_step) >> 2;
int dx_y_off = dx_offset / dx_step;
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
- float dx_data[ksY+1],dy_data[ksY+1],data[3][ksY+1];
+ float dx_data[ksY+1],dy_data[ksY+1], data[3][ksY+1];
__local float temp[6][THREADS];
+
#ifdef BORDER_CONSTANT
bool dx_con,dy_con;
- float dx_s,dy_s;
- for(int i=0; i < ksY+1; i++)
+ float dx_s, dy_s;
+ for (int i=0; i < ksY+1; i++)
{
dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows;
dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)];
dx_data[i] = dx_con ? dx_s : 0.0;
+
dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows;
dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)];
dy_data[i] = dy_con ? dy_s : 0.0;
+
data[0][i] = dx_data[i] * dx_data[i];
data[1][i] = dx_data[i] * dy_data[i];
data[2][i] = dy_data[i] * dy_data[i];
}
#else
int clamped_col = min(dst_cols, col);
- for(int i=0; i < ksY+1; i++)
+ for (int i=0; i < ksY+1; i++)
{
- int dx_selected_row;
- int dx_selected_col;
- dx_selected_row = ADDR_H(dx_startY+i, 0, dx_whole_rows);
- dx_selected_row = ADDR_B(dx_startY+i, dx_whole_rows, dx_selected_row);
- dx_selected_col = ADDR_L(dx_startX+clamped_col, 0, dx_whole_cols);
- dx_selected_col = ADDR_R(dx_startX+clamped_col, dx_whole_cols, dx_selected_col);
+ int dx_selected_row = dx_startY+i, dx_selected_col = dx_startX+clamped_col;
+ EXTRAPOLATE(dx_selected_row, dx_whole_rows)
+ EXTRAPOLATE(dx_selected_col, dx_whole_cols)
dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col];
- int dy_selected_row;
- int dy_selected_col;
- dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows);
- dy_selected_row = ADDR_B(dy_startY+i, dy_whole_rows, dy_selected_row);
- dy_selected_col = ADDR_L(dy_startX+clamped_col, 0, dy_whole_cols);
- dy_selected_col = ADDR_R(dy_startX+clamped_col, dy_whole_cols, dy_selected_col);
+ int dy_selected_row = dy_startY+i, dy_selected_col = dy_startX+clamped_col;
+ EXTRAPOLATE(dy_selected_row, dy_whole_rows)
+ EXTRAPOLATE(dy_selected_col, dy_whole_cols)
dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col];
data[0][i] = dx_data[i] * dx_data[i];
}
#endif
float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
- for(int i=1; i < ksY; i++)
+ for (int i=1; i < ksY; i++)
{
- sum0 += (data[0][i]);
- sum1 += (data[1][i]);
- sum2 += (data[2][i]);
+ sum0 += data[0][i];
+ sum1 += data[1][i];
+ sum2 += data[2][i];
}
- float sum01,sum02,sum11,sum12,sum21,sum22;
- sum01 = sum0 + (data[0][0]);
- sum02 = sum0 + (data[0][ksY]);
+
+ float sum01 = sum0 + data[0][0];
+ float sum02 = sum0 + data[0][ksY];
temp[0][col] = sum01;
temp[1][col] = sum02;
- sum11 = sum1 + (data[1][0]);
- sum12 = sum1 + (data[1][ksY]);
+ float sum11 = sum1 + data[1][0];
+ float sum12 = sum1 + data[1][ksY];
temp[2][col] = sum11;
temp[3][col] = sum12;
- sum21 = sum2 + (data[2][0]);
- sum22 = sum2 + (data[2][ksY]);
+ float sum21 = sum2 + data[2][0];
+ float sum22 = sum2 + data[2][ksY];
temp[4][col] = sum21;
temp[5][col] = sum22;
barrier(CLK_LOCAL_MEM_FENCE);
- if(col < (THREADS-(ksX-1)))
+
+ if (col < (THREADS- (ksX - 1)))
{
col += anX;
int posX = dst_startX - dst_x_off + col - anX;
int posY = (gly << 1);
int till = (ksX + 1)%2;
- float tmp_sum[6]={ 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0 };
- for(int k=0; k<6; k++)
- for(int i=-anX; i<=anX - till; i++)
- {
+ float tmp_sum[6] = { 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0 };
+ for (int k=0; k<6; k++)
+ for (int i=-anX; i<=anX - till; i++)
tmp_sum[k] += temp[k][col+i];
- }
- if(posX < dst_cols && (posY) < dst_rows)
+ if (posX < dst_cols && (posY) < dst_rows)
{
dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] =
tmp_sum[0] * tmp_sum[4] - tmp_sum[2] * tmp_sum[2] - k * (tmp_sum[0] + tmp_sum[4]) * (tmp_sum[0] + tmp_sum[4]);
}
- if(posX < dst_cols && (posY + 1) < dst_rows)
+ if (posX < dst_cols && (posY + 1) < dst_rows)
{
dst[(dst_startY+1) * (dst_step>>2)+ dst_startX + col - anX] =
tmp_sum[1] * tmp_sum[5] - tmp_sum[3] * tmp_sum[3] - k * (tmp_sum[1] + tmp_sum[5]) * (tmp_sum[1] + tmp_sum[5]);
//
//M*/
-#if defined (DOUBLE_SUPPORT)
-#pragma OPENCL EXTENSION cl_khr_fp64:enable
-#endif
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////Macro for border type////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////
-#ifdef BORDER_REPLICATE
-//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
-#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
-#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
-#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
-#endif
+#ifdef BORDER_CONSTANT
+#elif defined BORDER_REPLICATE
+#define EXTRAPOLATE(x, maxV) \
+ { \
+ x = max(min(x, maxV - 1), 0); \
+ }
+#elif defined BORDER_WRAP
+#define EXTRAPOLATE(x, maxV) \
+ { \
+ if (x < 0) \
+ x -= ((x - maxV + 1) / maxV) * maxV; \
+ if (x >= maxV) \
+ x %= maxV; \
+ }
+#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT101)
+#define EXTRAPOLATE_(x, maxV, delta) \
+ { \
+ if (maxV == 1) \
+ x = 0; \
+ else \
+ do \
+ { \
+ if ( x < 0 ) \
+ x = -x - 1 + delta; \
+ else \
+ x = maxV - 1 - (x - maxV) - delta; \
+ } \
+ while (x >= maxV || x < 0); \
+ }
#ifdef BORDER_REFLECT
-//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
-#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))
-#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i))
-#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
-#endif
-
-#ifdef BORDER_REFLECT101
-//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba
-#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
-#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i))
-#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
+#define EXTRAPOLATE(x, maxV) EXTRAPOLATE_(x, maxV, 0)
+#else
+#define EXTRAPOLATE(x, maxV) EXTRAPOLATE_(x, maxV, 1)
#endif
-
-#ifdef BORDER_WRAP
-//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
-#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))
-#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
-#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i))
-#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
+#else
+#error No extrapolation method
#endif
#define THREADS 256
-#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2)
+
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////calcHarris////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy, __global float *dst,
int dx_offset, int dx_whole_rows, int dx_whole_cols, int dx_step,
int dy_offset, int dy_whole_rows, int dy_whole_cols, int dy_step,
- int dst_offset, int dst_rows, int dst_cols, int dst_step,
- float k)
+ int dst_offset, int dst_rows, int dst_cols, int dst_step, float k)
{
int col = get_local_id(0);
- const int gX = get_group_id(0);
- const int gY = get_group_id(1);
- const int glx = get_global_id(0);
- const int gly = get_global_id(1);
+ int gX = get_group_id(0);
+ int gY = get_group_id(1);
+ int glx = get_global_id(0);
+ int gly = get_global_id(1);
int dx_x_off = (dx_offset % dx_step) >> 2;
int dx_y_off = dx_offset / dx_step;
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
int dst_startY = (gY << 1) + dst_y_off;
- float dx_data[ksY+1],dy_data[ksY+1],data[3][ksY+1];
+ float dx_data[ksY+1], dy_data[ksY+1], data[3][ksY+1];
__local float temp[6][THREADS];
+
#ifdef BORDER_CONSTANT
- bool dx_con,dy_con;
- float dx_s,dy_s;
- for(int i=0; i < ksY+1; i++)
+ bool dx_con, dy_con;
+ float dx_s, dy_s;
+ for (int i=0; i < ksY+1; i++)
{
dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows;
dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)];
}
#else
int clamped_col = min(dst_cols, col);
-
- for(int i=0; i < ksY+1; i++)
+ for (int i=0; i < ksY+1; i++)
{
- int dx_selected_row;
- int dx_selected_col;
- dx_selected_row = ADDR_H(dx_startY+i, 0, dx_whole_rows);
- dx_selected_row = ADDR_B(dx_startY+i, dx_whole_rows, dx_selected_row);
- dx_selected_col = ADDR_L(dx_startX+clamped_col, 0, dx_whole_cols);
- dx_selected_col = ADDR_R(dx_startX+clamped_col, dx_whole_cols, dx_selected_col);
+ int dx_selected_row = dx_startY+i, dx_selected_col = dx_startX+clamped_col;
+ EXTRAPOLATE(dx_selected_row, dx_whole_rows)
+ EXTRAPOLATE(dx_selected_col, dx_whole_cols)
dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col];
- int dy_selected_row;
- int dy_selected_col;
- dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows);
- dy_selected_row = ADDR_B(dy_startY+i, dy_whole_rows, dy_selected_row);
- dy_selected_col = ADDR_L(dy_startX+clamped_col, 0, dy_whole_cols);
- dy_selected_col = ADDR_R(dy_startX+clamped_col, dy_whole_cols, dy_selected_col);
+ int dy_selected_row = dy_startY+i, dy_selected_col = dy_startX+clamped_col;
+ EXTRAPOLATE(dy_selected_row, dy_whole_rows)
+ EXTRAPOLATE(dy_selected_col, dy_whole_cols)
dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col];
data[0][i] = dx_data[i] * dx_data[i];
}
#endif
float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
- for(int i=1; i < ksY; i++)
+ for (int i=1; i < ksY; i++)
{
sum0 += (data[0][i]);
sum1 += (data[1][i]);
sum2 += (data[2][i]);
}
- float sum01,sum02,sum11,sum12,sum21,sum22;
- sum01 = sum0 + (data[0][0]);
- sum02 = sum0 + (data[0][ksY]);
+
+ float sum01 = sum0 + (data[0][0]);
+ float sum02 = sum0 + (data[0][ksY]);
temp[0][col] = sum01;
temp[1][col] = sum02;
- sum11 = sum1 + (data[1][0]);
- sum12 = sum1 + (data[1][ksY]);
+ float sum11 = sum1 + (data[1][0]);
+ float sum12 = sum1 + (data[1][ksY]);
temp[2][col] = sum11;
temp[3][col] = sum12;
- sum21 = sum2 + (data[2][0]);
- sum22 = sum2 + (data[2][ksY]);
+ float sum21 = sum2 + (data[2][0]);
+ float sum22 = sum2 + (data[2][ksY]);
temp[4][col] = sum21;
temp[5][col] = sum22;
barrier(CLK_LOCAL_MEM_FENCE);
+
if(col < (THREADS-(ksX-1)))
{
col += anX;
int posX = dst_startX - dst_x_off + col - anX;
int posY = (gly << 1);
int till = (ksX + 1)%2;
- float tmp_sum[6]={ 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0 };
- for(int k=0; k<6; k++)
- for(int i=-anX; i<=anX - till; i++)
- {
+ float tmp_sum[6] = { 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0 };
+ for (int k=0; k<6; k++)
+ for (int i=-anX; i<=anX - till; i++)
tmp_sum[k] += temp[k][col+i];
- }
if(posX < dst_cols && (posY) < dst_rows)
{
float c = tmp_sum[4] * 0.5f;
dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] = (float)((a+c) - sqrt((a-c)*(a-c) + b*b));
}
- if(posX < dst_cols && (posY + 1) < dst_rows)
+ if (posX < dst_cols && (posY + 1) < dst_rows)
{
float a = tmp_sum[1] * 0.5f;
float b = tmp_sum[3];
--- /dev/null
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////Macro for border type////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////
+#ifdef BORDER_REPLICATE
+//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
+#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
+#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
+#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
+#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
+#endif
+
+#ifdef BORDER_REFLECT
+//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
+#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))
+#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
+#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i))
+#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
+#endif
+
+#ifdef BORDER_REFLECT101
+//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba
+#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
+#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
+#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i))
+#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
+#endif
+
+#ifdef BORDER_WRAP
+//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
+#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))
+#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
+#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i))
+#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
+#endif
+
+__kernel void sobel3(
+ __global uchar* Src,
+ __global float* DstX,
+ __global float* DstY,
+ int width, int height,
+ uint srcStride, uint dstStride,
+ float scale
+ )
+{
+ __local float lsmem[BLK_Y+2][BLK_X+2];
+
+ int lix = get_local_id(0);
+ int liy = get_local_id(1);
+
+ int gix = get_group_id(0);
+ int giy = get_group_id(1);
+
+ int id_x = get_global_id(0);
+ int id_y = get_global_id(1);
+
+ lsmem[liy+1][lix+1] = convert_float(Src[ id_y * srcStride + id_x ]);
+
+ int id_y_h = ADDR_H(id_y-1, 0,height);
+ int id_y_b = ADDR_B(id_y+1, height,id_y+1);
+
+ int id_x_l = ADDR_L(id_x-1, 0,width);
+ int id_x_r = ADDR_R(id_x+1, width,id_x+1);
+
+ if(liy==0)
+ {
+ lsmem[0][lix+1]=convert_float(Src[ id_y_h * srcStride + id_x ]);
+
+ if(lix==0)
+ lsmem[0][0]=convert_float(Src[ id_y_h * srcStride + id_x_l ]);
+ else if(lix==BLK_X-1)
+ lsmem[0][BLK_X+1]=convert_float(Src[ id_y_h * srcStride + id_x_r ]);
+ }
+ else if(liy==BLK_Y-1)
+ {
+ lsmem[BLK_Y+1][lix+1]=convert_float(Src[ id_y_b * srcStride + id_x ]);
+
+ if(lix==0)
+ lsmem[BLK_Y+1][0]=convert_float(Src[ id_y_b * srcStride + id_x_l ]);
+ else if(lix==BLK_X-1)
+ lsmem[BLK_Y+1][BLK_X+1]=convert_float(Src[ id_y_b * srcStride + id_x_r ]);
+ }
+
+ if(lix==0)
+ lsmem[liy+1][0] = convert_float(Src[ id_y * srcStride + id_x_l ]);
+ else if(lix==BLK_X-1)
+ lsmem[liy+1][BLK_X+1] = convert_float(Src[ id_y * srcStride + id_x_r ]);
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ float u1 = lsmem[liy][lix];
+ float u2 = lsmem[liy][lix+1];
+ float u3 = lsmem[liy][lix+2];
+
+ float m1 = lsmem[liy+1][lix];
+ float m2 = lsmem[liy+1][lix+1];
+ float m3 = lsmem[liy+1][lix+2];
+
+ float b1 = lsmem[liy+2][lix];
+ float b2 = lsmem[liy+2][lix+1];
+ float b3 = lsmem[liy+2][lix+2];
+
+ //m2 * scale;//
+ float dx = mad(2.0f, m3 - m1, u3 - u1 + b3 - b1 );
+ DstX[ id_y * dstStride + id_x ] = dx * scale;
+
+ float dy = mad(2.0f, b2 - u2, b1 - u1 + b3 - u3);
+ DstY[ id_y * dstStride + id_x ] = dy * scale;
+}
\ No newline at end of file
// License Agreement
// For Open Source Computer Vision Library
//
-// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
-// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
-// @Authors
-// Jia Haipeng, jiahaipeng95@gmail.com
-//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
-///////////////////////////////////////////////////////////////////////////////////////////////
-//////////////////////////////////optimized code using vector ////////////////////////////////
-////////////vector fuction name format: split_vector_C(channels number)_D(data type depth)//////
-////////////////////////////////////////////////////////////////////////////////////////////////
-__kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int src_offset,
- __global uchar *mat_dst0, int dst0_step, int dst0_offset,
- __global uchar *mat_dst1, int dst1_step, int dst1_offset,
- __global uchar *mat_dst2, int dst2_step, int dst2_offset,
- __global uchar *mat_dst3, int dst3_step, int dst3_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- x = x << 2;
-
- int src_idx = mad24(y, src_step, src_offset + (x << 2));
-
- int dst0_start = mad24(y, dst0_step, dst0_offset);
- int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
- int dst0_idx = mad24(y, dst0_step, dst0_offset + x) & (int)0xfffffffc;
-
- int dst1_start = mad24(y, dst1_step, dst1_offset);
- int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
- int dst1_idx = mad24(y, dst1_step, dst1_offset + x) & (int)0xfffffffc;
-
- int dst2_start = mad24(y, dst2_step, dst2_offset);
- int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
- int dst2_idx = mad24(y, dst2_step, dst2_offset + x) & (int)0xfffffffc;
-
- int dst3_start = mad24(y, dst3_step, dst3_offset);
- int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
- int dst3_idx = mad24(y, dst3_step, dst3_offset + x) & (int)0xfffffffc;
-
- uchar4 data_0 = *((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? src_idx - 12 : src_idx)));
- uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8 >= 0 ? src_idx - 8 : src_idx)));
- uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4 >= 0 ? src_idx - 4 : src_idx)));
- uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0 ));
-
- int total_bytes = src_offset + rows * src_step;
- uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4 < total_bytes ? src_idx + 4 : src_idx)));
- uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8 < total_bytes ? src_idx + 8 : src_idx)));
- uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx)));
-
- uchar4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3;
-
- if((dst0_offset & 3) == 3)
- tmp_data0 = (uchar4)(data_0.x, data_1.x, data_2.x, data_3.x);
- if((dst0_offset & 3) == 2)
- tmp_data0 = (uchar4)(data_1.x, data_2.x, data_3.x, data_4.x);
- if((dst0_offset & 3) == 1)
- tmp_data0 = (uchar4)(data_2.x, data_3.x, data_4.x, data_5.x);
- if((dst0_offset & 3) == 0)
- tmp_data0 = (uchar4)(data_3.x, data_4.x, data_5.x, data_6.x);
-
- if((dst1_offset & 3) == 3)
- tmp_data1 = (uchar4)(data_0.y, data_1.y, data_2.y, data_3.y);
- if((dst1_offset & 3) == 2)
- tmp_data1 = (uchar4)(data_1.y, data_2.y, data_3.y, data_4.y);
- if((dst1_offset & 3) == 1)
- tmp_data1 = (uchar4)(data_2.y, data_3.y, data_4.y, data_5.y);
- if((dst1_offset & 3) == 0)
- tmp_data1 = (uchar4)(data_3.y, data_4.y, data_5.y, data_6.y);
-
- if((dst2_offset & 3) == 3)
- tmp_data2 = (uchar4)(data_0.z, data_1.z, data_2.z, data_3.z);
- if((dst2_offset & 3) == 2)
- tmp_data2 = (uchar4)(data_1.z, data_2.z, data_3.z, data_4.z);
- if((dst2_offset & 3) == 1)
- tmp_data2 = (uchar4)(data_2.z, data_3.z, data_4.z, data_5.z);
- if((dst2_offset & 3) == 0)
- tmp_data2 = (uchar4)(data_3.z, data_4.z, data_5.z, data_6.z);
-
- if((dst3_offset & 3) == 3)
- tmp_data3 = (uchar4)(data_0.w, data_1.w, data_2.w, data_3.w);
- if((dst3_offset & 3) == 2)
- tmp_data3 = (uchar4)(data_1.w, data_2.w, data_3.w, data_4.w);
- if((dst3_offset & 3) == 1)
- tmp_data3 = (uchar4)(data_2.w, data_3.w, data_4.w, data_5.w);
- if((dst3_offset & 3) == 0)
- tmp_data3 = (uchar4)(data_3.w, data_4.w, data_5.w, data_6.w);
-
- uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx));
- uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx));
- uchar4 dst2_data = *((__global uchar4 *)(mat_dst2 + dst2_idx));
- uchar4 dst3_data = *((__global uchar4 *)(mat_dst3 + dst3_idx));
-
- tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
- tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? tmp_data0.y : dst0_data.y;
- tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.z : dst0_data.z;
- tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? tmp_data0.w : dst0_data.w;
-
- tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
- tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? tmp_data1.y : dst1_data.y;
- tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.z : dst1_data.z;
- tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? tmp_data1.w : dst1_data.w;
-
- tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
- tmp_data2.y = ((dst2_idx + 1 >= dst2_start) && (dst2_idx + 1 < dst2_end)) ? tmp_data2.y : dst2_data.y;
- tmp_data2.z = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.z : dst2_data.z;
- tmp_data2.w = ((dst2_idx + 3 >= dst2_start) && (dst2_idx + 3 < dst2_end)) ? tmp_data2.w : dst2_data.w;
-
- tmp_data3.x = ((dst3_idx + 0 >= dst3_start) && (dst3_idx + 0 < dst3_end)) ? tmp_data3.x : dst3_data.x;
- tmp_data3.y = ((dst3_idx + 1 >= dst3_start) && (dst3_idx + 1 < dst3_end)) ? tmp_data3.y : dst3_data.y;
- tmp_data3.z = ((dst3_idx + 2 >= dst3_start) && (dst3_idx + 2 < dst3_end)) ? tmp_data3.z : dst3_data.z;
- tmp_data3.w = ((dst3_idx + 3 >= dst3_start) && (dst3_idx + 3 < dst3_end)) ? tmp_data3.w : dst3_data.w;
-
- *((__global uchar4 *)(mat_dst0 + dst0_idx)) = tmp_data0;
- *((__global uchar4 *)(mat_dst1 + dst1_idx)) = tmp_data1;
- *((__global uchar4 *)(mat_dst2 + dst2_idx)) = tmp_data2;
- *((__global uchar4 *)(mat_dst3 + dst3_idx)) = tmp_data3;
- }
-}
-
-__kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int src_offset,
- __global uchar *mat_dst0, int dst0_step, int dst0_offset,
- __global uchar *mat_dst1, int dst1_step, int dst1_offset,
- __global uchar *mat_dst2, int dst2_step, int dst2_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- x = x << 2;
-
- int src_idx = mad24(y, src_step, src_offset);
-
- int dst0_start = mad24(y, dst0_step, dst0_offset);
- int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
- int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
-
- int dst1_start = mad24(y, dst1_step, dst1_offset);
- int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
- int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
-
- int dst2_start = mad24(y, dst2_step, dst2_offset);
- int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
- int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
-
- uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx));
- uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx));
- uchar4 dst2_data = *((__global uchar4 *)(mat_dst2 + dst2_idx));
-
- uchar4 tmp_data0, tmp_data1, tmp_data2;
-
- uchar src_data_0 = *(mat_src + src_idx + 3 * x - 9);
- uchar src_data_1 = *(mat_src + src_idx + 3 * x - 8);
- uchar src_data_2 = *(mat_src + src_idx + 3 * x - 7);
-
- uchar src_data_3 = *(mat_src + src_idx + 3 * x - 6);
- uchar src_data_4 = *(mat_src + src_idx + 3 * x - 5);
- uchar src_data_5 = *(mat_src + src_idx + 3 * x - 4);
-
- uchar src_data_6 = *(mat_src + src_idx + 3 * x - 3);
- uchar src_data_7 = *(mat_src + src_idx + 3 * x - 2);
- uchar src_data_8 = *(mat_src + src_idx + 3 * x - 1);
-
- uchar src_data_9 = *(mat_src + src_idx + 3 * x + 0);
- uchar src_data_10 = *(mat_src + src_idx + 3 * x + 1);
- uchar src_data_11 = *(mat_src + src_idx + 3 * x + 2);
-
- uchar src_data_12 = *(mat_src + src_idx + 3 * x + 3);
- uchar src_data_13 = *(mat_src + src_idx + 3 * x + 4);
- uchar src_data_14 = *(mat_src + src_idx + 3 * x + 5);
-
- uchar src_data_15 = *(mat_src + src_idx + 3 * x + 6);
- uchar src_data_16 = *(mat_src + src_idx + 3 * x + 7);
- uchar src_data_17 = *(mat_src + src_idx + 3 * x + 8);
-
- uchar src_data_18 = *(mat_src + src_idx + 3 * x + 9);
- uchar src_data_19 = *(mat_src + src_idx + 3 * x + 10);
- uchar src_data_20 = *(mat_src + src_idx + 3 * x + 11);
-
- uchar data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18};
- int index = 3 - dst0_offset & 3;
- tmp_data0 = (uchar4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
-
- uchar4 data0, data1, data2;
-
- data0 = (uchar4)(src_data_1, src_data_4, src_data_7, src_data_10);
- data1 = (dst1_offset & 3) == 2 ? (uchar4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0;
- data2 = (dst1_offset & 3) == 1 ? (uchar4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1;
- tmp_data1 = (dst1_offset & 3) == 0 ? (uchar4)(src_data_10, src_data_13, src_data_16, src_data_19): data2;
-
- data0 = (uchar4)(src_data_2, src_data_5, src_data_8, src_data_11);
- data1 = (dst2_offset & 3) == 2 ? (uchar4)(src_data_5, src_data_8, src_data_11, src_data_14) : data0;
- data2 = (dst2_offset & 3) == 1 ? (uchar4)(src_data_8, src_data_11, src_data_14, src_data_17) : data1;
- tmp_data2 = (dst2_offset & 3) == 0 ? (uchar4)(src_data_11, src_data_14, src_data_17, src_data_20) : data2;
-
- tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
- tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? tmp_data0.y : dst0_data.y;
- tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.z : dst0_data.z;
- tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? tmp_data0.w : dst0_data.w;
-
- tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
- tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? tmp_data1.y : dst1_data.y;
- tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.z : dst1_data.z;
- tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? tmp_data1.w : dst1_data.w;
-
- tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
- tmp_data2.y = ((dst2_idx + 1 >= dst2_start) && (dst2_idx + 1 < dst2_end)) ? tmp_data2.y : dst2_data.y;
- tmp_data2.z = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.z : dst2_data.z;
- tmp_data2.w = ((dst2_idx + 3 >= dst2_start) && (dst2_idx + 3 < dst2_end)) ? tmp_data2.w : dst2_data.w;
-
- *((__global uchar4 *)(mat_dst0 + dst0_idx)) = tmp_data0;
- *((__global uchar4 *)(mat_dst1 + dst1_idx)) = tmp_data1;
- *((__global uchar4 *)(mat_dst2 + dst2_idx)) = tmp_data2;
- }
-}
-
-__kernel void split_vector_C2_D0 (__global uchar *mat_src, int src_step, int src_offset,
- __global uchar *mat_dst0, int dst0_step, int dst0_offset,
- __global uchar *mat_dst1, int dst1_step, int dst1_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- x = x << 2;
-
- #define dst0_align ((dst0_offset & 3) << 1)
- #define dst1_align ((dst1_offset & 3) << 1)
- int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
- int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
-
- int dst0_start = mad24(y, dst0_step, dst0_offset);
- int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
- int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
-
- int dst1_start = mad24(y, dst1_step, dst1_offset);
- int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
- int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
-
- int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
- int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
- uchar8 src_data_0 = vload8(0, mat_src + src_idx_0);
- uchar8 src_data_1 = vload8(0, mat_src + src_idx_1);
- if(src_idx_0 == -6)
- src_data_0.s01234567 = src_data_0.s67012345;
- if(src_idx_0 == -4)
- src_data_0.s01234567 = src_data_0.s45670123;
- if(src_idx_0 == -2)
- src_data_0.s01234567 = src_data_0.s23456701;
- if(src_idx_1 == -6)
- src_data_1.s01234567 = src_data_1.s67012345;
- if(src_idx_1 == -4)
- src_data_1.s01234567 = src_data_1.s45670123;
- if(src_idx_1 == -2)
- src_data_1.s01234567 = src_data_1.s23456701;
-
- uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx));
- uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx));
-
- uchar4 tmp_data0, tmp_data1;
-
- tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? src_data_0.s0 : dst0_data.x;
- tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? src_data_0.s2 : dst0_data.y;
- tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? src_data_0.s4 : dst0_data.z;
- tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? src_data_0.s6 : dst0_data.w;
-
- tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? src_data_1.s1 : dst1_data.x;
- tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? src_data_1.s3 : dst1_data.y;
- tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? src_data_1.s5 : dst1_data.z;
- tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? src_data_1.s7 : dst1_data.w;
-
- *((__global uchar4 *)(mat_dst0 + dst0_idx)) = tmp_data0;
- *((__global uchar4 *)(mat_dst1 + dst1_idx)) = tmp_data1;
- }
-}
-
-__kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int src_offset,
- __global char *mat_dst0, int dst0_step, int dst0_offset,
- __global char *mat_dst1, int dst1_step, int dst1_offset,
- __global char *mat_dst2, int dst2_step, int dst2_offset,
- __global char *mat_dst3, int dst3_step, int dst3_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- x = x << 2;
-
- int src_idx = mad24(y, src_step, src_offset + (x << 2));
-
- int dst0_start = mad24(y, dst0_step, dst0_offset);
- int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
- int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
-
- int dst1_start = mad24(y, dst1_step, dst1_offset);
- int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
- int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
-
- int dst2_start = mad24(y, dst2_step, dst2_offset);
- int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
- int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
-
- int dst3_start = mad24(y, dst3_step, dst3_offset);
- int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
- int dst3_idx = mad24(y, dst3_step, dst3_offset + x & (int)0xfffffffc);
-
- char4 data_0 = *((global char4 *)(mat_src + src_idx - 12));
- char4 data_1 = *((global char4 *)(mat_src + src_idx - 8 ));
- char4 data_2 = *((global char4 *)(mat_src + src_idx - 4 ));
- char4 data_3 = *((global char4 *)(mat_src + src_idx + 0 ));
- char4 data_4 = *((global char4 *)(mat_src + src_idx + 4 ));
- char4 data_5 = *((global char4 *)(mat_src + src_idx + 8 ));
- char4 data_6 = *((global char4 *)(mat_src + src_idx + 12));
-
- char4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3;
-
- if((dst0_offset & 3) == 3)
- tmp_data0 = (char4)(data_0.x, data_1.x, data_2.x, data_3.x);
- if((dst0_offset & 3) == 2)
- tmp_data0 = (char4)(data_1.x, data_2.x, data_3.x, data_4.x);
- if((dst0_offset & 3) == 1)
- tmp_data0 = (char4)(data_2.x, data_3.x, data_4.x, data_5.x);
- if((dst0_offset & 3) == 0)
- tmp_data0 = (char4)(data_3.x, data_4.x, data_5.x, data_6.x);
-
- if((dst1_offset & 3) == 3)
- tmp_data1 = (char4)(data_0.y, data_1.y, data_2.y, data_3.y);
- if((dst1_offset & 3) == 2)
- tmp_data1 = (char4)(data_1.y, data_2.y, data_3.y, data_4.y);
- if((dst1_offset & 3) == 1)
- tmp_data1 = (char4)(data_2.y, data_3.y, data_4.y, data_5.y);
- if((dst1_offset & 3) == 0)
- tmp_data1 = (char4)(data_3.y, data_4.y, data_5.y, data_6.y);
-
- if((dst2_offset & 3) == 3)
- tmp_data2 = (char4)(data_0.z, data_1.z, data_2.z, data_3.z);
- if((dst2_offset & 3) == 2)
- tmp_data2 = (char4)(data_1.z, data_2.z, data_3.z, data_4.z);
- if((dst2_offset & 3) == 1)
- tmp_data2 = (char4)(data_2.z, data_3.z, data_4.z, data_5.z);
- if((dst2_offset & 3) == 0)
- tmp_data2 = (char4)(data_3.z, data_4.z, data_5.z, data_6.z);
-
- if((dst3_offset & 3) == 3)
- tmp_data3 = (char4)(data_0.w, data_1.w, data_2.w, data_3.w);
- if((dst3_offset & 3) == 2)
- tmp_data3 = (char4)(data_1.w, data_2.w, data_3.w, data_4.w);
- if((dst3_offset & 3) == 1)
- tmp_data3 = (char4)(data_2.w, data_3.w, data_4.w, data_5.w);
- if((dst3_offset & 3) == 0)
- tmp_data3 = (char4)(data_3.w, data_4.w, data_5.w, data_6.w);
-
- char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx));
- char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx));
- char4 dst2_data = *((__global char4 *)(mat_dst2 + dst2_idx));
- char4 dst3_data = *((__global char4 *)(mat_dst3 + dst3_idx));
-
- tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
- tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? tmp_data0.y : dst0_data.y;
- tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.z : dst0_data.z;
- tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? tmp_data0.w : dst0_data.w;
-
- tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
- tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? tmp_data1.y : dst1_data.y;
- tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.z : dst1_data.z;
- tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? tmp_data1.w : dst1_data.w;
-
- tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
- tmp_data2.y = ((dst2_idx + 1 >= dst2_start) && (dst2_idx + 1 < dst2_end)) ? tmp_data2.y : dst2_data.y;
- tmp_data2.z = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.z : dst2_data.z;
- tmp_data2.w = ((dst2_idx + 3 >= dst2_start) && (dst2_idx + 3 < dst2_end)) ? tmp_data2.w : dst2_data.w;
-
- tmp_data3.x = ((dst3_idx + 0 >= dst3_start) && (dst3_idx + 0 < dst3_end)) ? tmp_data3.x : dst3_data.x;
- tmp_data3.y = ((dst3_idx + 1 >= dst3_start) && (dst3_idx + 1 < dst3_end)) ? tmp_data3.y : dst3_data.y;
- tmp_data3.z = ((dst3_idx + 2 >= dst3_start) && (dst3_idx + 2 < dst3_end)) ? tmp_data3.z : dst3_data.z;
- tmp_data3.w = ((dst3_idx + 3 >= dst3_start) && (dst3_idx + 3 < dst3_end)) ? tmp_data3.w : dst3_data.w;
-
- *((__global char4 *)(mat_dst0 + dst0_idx)) = tmp_data0;
- *((__global char4 *)(mat_dst1 + dst1_idx)) = tmp_data1;
- *((__global char4 *)(mat_dst2 + dst2_idx)) = tmp_data2;
- *((__global char4 *)(mat_dst3 + dst3_idx)) = tmp_data3;
- }
-}
-
-__kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int src_offset,
- __global char *mat_dst0, int dst0_step, int dst0_offset,
- __global char *mat_dst1, int dst1_step, int dst1_offset,
- __global char *mat_dst2, int dst2_step, int dst2_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- x = x << 2;
-
- int src_idx = mad24(y, src_step, src_offset);
-
- int dst0_start = mad24(y, dst0_step, dst0_offset);
- int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
- int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
-
- int dst1_start = mad24(y, dst1_step, dst1_offset);
- int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
- int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
-
- int dst2_start = mad24(y, dst2_step, dst2_offset);
- int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
- int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
-
- char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx));
- char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx));
- char4 dst2_data = *((__global char4 *)(mat_dst2 + dst2_idx));
-
- char4 tmp_data0, tmp_data1, tmp_data2;
-
- char src_data_0 = *(mat_src + src_idx + 3 * x - 9);
- char src_data_1 = *(mat_src + src_idx + 3 * x - 8);
- char src_data_2 = *(mat_src + src_idx + 3 * x - 7);
-
- char src_data_3 = *(mat_src + src_idx + 3 * x - 6);
- char src_data_4 = *(mat_src + src_idx + 3 * x - 5);
- char src_data_5 = *(mat_src + src_idx + 3 * x - 4);
-
- char src_data_6 = *(mat_src + src_idx + 3 * x - 3);
- char src_data_7 = *(mat_src + src_idx + 3 * x - 2);
- char src_data_8 = *(mat_src + src_idx + 3 * x - 1);
-
- char src_data_9 = *(mat_src + src_idx + 3 * x + 0);
- char src_data_10 = *(mat_src + src_idx + 3 * x + 1);
- char src_data_11 = *(mat_src + src_idx + 3 * x + 2);
-
- char src_data_12 = *(mat_src + src_idx + 3 * x + 3);
- char src_data_13 = *(mat_src + src_idx + 3 * x + 4);
- char src_data_14 = *(mat_src + src_idx + 3 * x + 5);
-
- char src_data_15 = *(mat_src + src_idx + 3 * x + 6);
- char src_data_16 = *(mat_src + src_idx + 3 * x + 7);
- char src_data_17 = *(mat_src + src_idx + 3 * x + 8);
-
- char src_data_18 = *(mat_src + src_idx + 3 * x + 9);
- char src_data_19 = *(mat_src + src_idx + 3 * x + 10);
- char src_data_20 = *(mat_src + src_idx + 3 * x + 11);
-
- char data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18};
- int index = 3 - dst0_offset & 3;
- tmp_data0 = (char4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
-
- char4 data0, data1, data2;
-
- data0 = (char4)(src_data_1, src_data_4, src_data_7, src_data_10);
- data1 = (dst1_offset & 3) == 2 ? (char4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0;
- data2 = (dst1_offset & 3) == 1 ? (char4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1;
- tmp_data1 = (dst1_offset & 3) == 0 ? (char4)(src_data_10, src_data_13, src_data_16, src_data_19): data2;
-
- data0 = (char4)(src_data_2, src_data_5, src_data_8, src_data_11);
- data1 = (dst2_offset & 3) == 2 ? (char4)(src_data_5, src_data_8, src_data_11, src_data_14) : data0;
- data2 = (dst2_offset & 3) == 1 ? (char4)(src_data_8, src_data_11, src_data_14, src_data_17) : data1;
- tmp_data2 = (dst2_offset & 3) == 0 ? (char4)(src_data_11, src_data_14, src_data_17, src_data_20) : data2;
-
- tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
- tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? tmp_data0.y : dst0_data.y;
- tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.z : dst0_data.z;
- tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? tmp_data0.w : dst0_data.w;
-
- tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
- tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? tmp_data1.y : dst1_data.y;
- tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.z : dst1_data.z;
- tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? tmp_data1.w : dst1_data.w;
-
- tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
- tmp_data2.y = ((dst2_idx + 1 >= dst2_start) && (dst2_idx + 1 < dst2_end)) ? tmp_data2.y : dst2_data.y;
- tmp_data2.z = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.z : dst2_data.z;
- tmp_data2.w = ((dst2_idx + 3 >= dst2_start) && (dst2_idx + 3 < dst2_end)) ? tmp_data2.w : dst2_data.w;
-
- *((__global char4 *)(mat_dst0 + dst0_idx)) = tmp_data0;
- *((__global char4 *)(mat_dst1 + dst1_idx)) = tmp_data1;
- *((__global char4 *)(mat_dst2 + dst2_idx)) = tmp_data2;
- }
-}
-
-__kernel void split_vector_C2_D1 (__global char *mat_src, int src_step, int src_offset,
- __global char *mat_dst0, int dst0_step, int dst0_offset,
- __global char *mat_dst1, int dst1_step, int dst1_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- x = x << 2;
-
- #define dst0_align ((dst0_offset & 3) << 1)
- #define dst1_align ((dst1_offset & 3) << 1)
- int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
- int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
-
- int dst0_start = mad24(y, dst0_step, dst0_offset);
- int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
- int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
-
- int dst1_start = mad24(y, dst1_step, dst1_offset);
- int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
- int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
- int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
- int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
- char8 src_data_0 = vload8(0, mat_src + src_idx_0);
- char8 src_data_1 = vload8(0, mat_src + src_idx_1);
- if(src_idx_0 == -6)
- src_data_0.s01234567 = src_data_0.s67012345;
- if(src_idx_0 == -4)
- src_data_0.s01234567 = src_data_0.s45670123;
- if(src_idx_0 == -2)
- src_data_0.s01234567 = src_data_0.s23456701;
- if(src_idx_1 == -6)
- src_data_1.s01234567 = src_data_1.s67012345;
- if(src_idx_1 == -4)
- src_data_1.s01234567 = src_data_1.s45670123;
- if(src_idx_1 == -2)
- src_data_1.s01234567 = src_data_1.s23456701;
- char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx));
- char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx));
-
- char4 tmp_data0, tmp_data1;
-
- tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? src_data_0.s0 : dst0_data.x;
- tmp_data0.y = ((dst0_idx + 1 >= dst0_start) && (dst0_idx + 1 < dst0_end)) ? src_data_0.s2 : dst0_data.y;
- tmp_data0.z = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? src_data_0.s4 : dst0_data.z;
- tmp_data0.w = ((dst0_idx + 3 >= dst0_start) && (dst0_idx + 3 < dst0_end)) ? src_data_0.s6 : dst0_data.w;
-
- tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? src_data_1.s1 : dst1_data.x;
- tmp_data1.y = ((dst1_idx + 1 >= dst1_start) && (dst1_idx + 1 < dst1_end)) ? src_data_1.s3 : dst1_data.y;
- tmp_data1.z = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? src_data_1.s5 : dst1_data.z;
- tmp_data1.w = ((dst1_idx + 3 >= dst1_start) && (dst1_idx + 3 < dst1_end)) ? src_data_1.s7 : dst1_data.w;
-
- *((__global char4 *)(mat_dst0 + dst0_idx)) = tmp_data0;
- *((__global char4 *)(mat_dst1 + dst1_idx)) = tmp_data1;
- }
-}
-
-__kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int src_offset,
- __global ushort *mat_dst0, int dst0_step, int dst0_offset,
- __global ushort *mat_dst1, int dst1_step, int dst1_offset,
- __global ushort *mat_dst2, int dst2_step, int dst2_offset,
- __global ushort *mat_dst3, int dst3_step, int dst3_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- x = x << 1;
-
- int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
- int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
-
- int dst0_start = mad24(y, dst0_step, dst0_offset);
- int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
- int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
-
- int dst1_start = mad24(y, dst1_step, dst1_offset);
- int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
- int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
-
- int dst2_start = mad24(y, dst2_step, dst2_offset);
- int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
- int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
-
- int dst3_start = mad24(y, dst3_step, dst3_offset);
- int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
- int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc);
-
- int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
- ushort8 src_data0 = vload8(0,(__global ushort *)((__global char *)mat_src + src_idx_0));
- if(src_idx_0 == -6)
- src_data0.s01234567 = src_data0.s67012345;
- if(src_idx_0 == -4)
- src_data0.s01234567 = src_data0.s45670123;
- if(src_idx_0 == -2)
- src_data0.s01234567 = src_data0.s23456701;
- ushort4 src_data1 = *((__global ushort4 *)((__global char *)mat_src + src_idx_1));
-
- ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
- ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));
- ushort2 dst2_data = *((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx));
- ushort2 dst3_data = *((__global ushort2 *)((__global char *)mat_dst3 + dst3_idx));
-
- ushort2 tmp_data0, tmp_data1, tmp_data2, tmp_data3;
-
- tmp_data0 = (dst0_offset & 3) == 0 ? (ushort2)(src_data0.s4, src_data1.s0) : (ushort2)(src_data0.s0, src_data0.s4);
- tmp_data1 = (dst1_offset & 3) == 0 ? (ushort2)(src_data0.s5, src_data1.s1) : (ushort2)(src_data0.s1, src_data0.s5);
- tmp_data2 = (dst2_offset & 3) == 0 ? (ushort2)(src_data0.s6, src_data1.s2) : (ushort2)(src_data0.s2, src_data0.s6);
- tmp_data3 = (dst3_offset & 3) == 0 ? (ushort2)(src_data0.s7, src_data1.s3) : (ushort2)(src_data0.s3, src_data0.s7);
-
- tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
- tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.y : dst0_data.y;
-
- tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
- tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.y : dst1_data.y;
-
- tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
- tmp_data2.y = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.y : dst2_data.y;
-
- tmp_data3.x = ((dst3_idx + 0 >= dst3_start) && (dst3_idx + 0 < dst3_end)) ? tmp_data3.x : dst3_data.x;
- tmp_data3.y = ((dst3_idx + 2 >= dst3_start) && (dst3_idx + 2 < dst3_end)) ? tmp_data3.y : dst3_data.y;
-
- *((global ushort2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0;
- *((global ushort2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1;
- *((global ushort2 *)((__global char *)mat_dst2 + dst2_idx)) = tmp_data2;
- *((global ushort2 *)((__global char *)mat_dst3 + dst3_idx)) = tmp_data3;
- }
-}
-
-__kernel void split_vector_C3_D2 (__global ushort *mat_src, int src_step, int src_offset,
- __global ushort *mat_dst0, int dst0_step, int dst0_offset,
- __global ushort *mat_dst1, int dst1_step, int dst1_offset,
- __global ushort *mat_dst2, int dst2_step, int dst2_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- x = x << 1;
-
- int src_idx = mad24(y, src_step, src_offset);
-
- int dst0_start = mad24(y, dst0_step, dst0_offset);
- int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
- int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
-
- int dst1_start = mad24(y, dst1_step, dst1_offset);
- int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
- int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
-
- int dst2_start = mad24(y, dst2_step, dst2_offset);
- int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
- int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
-
- ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
- ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));
- ushort2 dst2_data = *((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx));
-
- ushort2 tmp_data0, tmp_data1, tmp_data2;
-
- ushort src_data_0 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x - 3];
- ushort src_data_1 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x - 2];
- ushort src_data_2 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x - 1];
- ushort src_data_3 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 0];
- ushort src_data_4 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 1];
- ushort src_data_5 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 2];
- ushort src_data_6 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 3];
- ushort src_data_7 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 4];
- ushort src_data_8 = ((__global ushort *)((__global char *)mat_src + src_idx))[3 * x + 5];
-
- tmp_data0 = (dst0_offset & 3) == 0 ? (ushort2)(src_data_3, src_data_6) : (ushort2)(src_data_0, src_data_3);
- tmp_data1 = (dst1_offset & 3) == 0 ? (ushort2)(src_data_4, src_data_7) : (ushort2)(src_data_1, src_data_4);
- tmp_data2 = (dst2_offset & 3) == 0 ? (ushort2)(src_data_5, src_data_8) : (ushort2)(src_data_2, src_data_5);
-
- tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
- tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.y : dst0_data.y;
-
- tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
- tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.y : dst1_data.y;
-
- tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
- tmp_data2.y = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.y : dst2_data.y;
-
- *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0;
- *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1;
- *((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx)) = tmp_data2;
- }
-}
-
-__kernel void split_vector_C2_D2 (__global ushort *mat_src, int src_step, int src_offset,
- __global ushort *mat_dst0, int dst0_step, int dst0_offset,
- __global ushort *mat_dst1, int dst1_step, int dst1_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- x = x << 1;
-
- #define dst0_align ((dst0_offset & 3) << 1)
- #define dst1_align ((dst1_offset & 3) << 1)
- int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
- int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
-
- int dst0_start = mad24(y, dst0_step, dst0_offset);
- int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
- int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
-
- int dst1_start = mad24(y, dst1_step, dst1_offset);
- int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
- int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
-
- int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
- int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
- ushort4 src_data_0 = vload4(0, (__global ushort *)((__global char *)mat_src + src1_index_fix));
- ushort4 src_data_1 = vload4(0, (__global ushort *)((__global char *)mat_src + src2_index_fix));
- if(src_idx_0 < 0)
- {
- ushort4 tmp;
- tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
- src_data_0.xyzw = (src_idx_1 == -1) ? src_data_0.wxyz:tmp.xyzw;
- }
- if(src_idx_1 < 0)
- {
- ushort4 tmp;
- tmp.xyzw = (src_idx_1 == -2) ? src_data_1.zwxy : src_data_1.yzwx;
- src_data_1.xyzw = (src_idx_1 == -1) ? src_data_1.wxyz : tmp.xyzw;
- }
-
- ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
- ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));
-
- ushort2 tmp_data0, tmp_data1;
-
- tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? src_data_0.x : dst0_data.x;
- tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? src_data_0.z : dst0_data.y;
-
- tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? src_data_1.y : dst1_data.x;
- tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? src_data_1.w : dst1_data.y;
-
- *((global ushort2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0;
- *((global ushort2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1;
- }
-}
-__kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int src_offset,
- __global short *mat_dst0, int dst0_step, int dst0_offset,
- __global short *mat_dst1, int dst1_step, int dst1_offset,
- __global short *mat_dst2, int dst2_step, int dst2_offset,
- __global short *mat_dst3, int dst3_step, int dst3_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- x = x << 1;
-
- int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
- int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
-
- int dst0_start = mad24(y, dst0_step, dst0_offset);
- int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
- int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
-
- int dst1_start = mad24(y, dst1_step, dst1_offset);
- int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
- int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
-
- int dst2_start = mad24(y, dst2_step, dst2_offset);
- int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
- int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
-
- int dst3_start = mad24(y, dst3_step, dst3_offset);
- int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
- int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc);
- int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
- short8 src_data0 = vload8(0,(__global short *)((__global char *)mat_src + src_idx_0));
-
- if(src_idx_0 == -6)
- src_data0.s01234567 = src_data0.s67012345;
- if(src_idx_0 == -4)
- src_data0.s01234567 = src_data0.s45670123;
- if(src_idx_0 == -2)
- src_data0.s01234567 = src_data0.s23456701;
-
- short4 src_data1 = *((__global short4 *)((__global char *)mat_src + src_idx_1));
-
- short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
- short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
- short2 dst2_data = *((__global short2 *)((__global char *)mat_dst2 + dst2_idx));
- short2 dst3_data = *((__global short2 *)((__global char *)mat_dst3 + dst3_idx));
-
- short2 tmp_data0, tmp_data1, tmp_data2, tmp_data3;
-
- tmp_data0 = (dst0_offset & 3) == 0 ? (short2)(src_data0.s4, src_data1.s0) : (short2)(src_data0.s0, src_data0.s4);
- tmp_data1 = (dst1_offset & 3) == 0 ? (short2)(src_data0.s5, src_data1.s1) : (short2)(src_data0.s1, src_data0.s5);
- tmp_data2 = (dst2_offset & 3) == 0 ? (short2)(src_data0.s6, src_data1.s2) : (short2)(src_data0.s2, src_data0.s6);
- tmp_data3 = (dst3_offset & 3) == 0 ? (short2)(src_data0.s7, src_data1.s3) : (short2)(src_data0.s3, src_data0.s7);
-
- tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
- tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.y : dst0_data.y;
-
- tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
- tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.y : dst1_data.y;
-
- tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
- tmp_data2.y = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.y : dst2_data.y;
-
- tmp_data3.x = ((dst3_idx + 0 >= dst3_start) && (dst3_idx + 0 < dst3_end)) ? tmp_data3.x : dst3_data.x;
- tmp_data3.y = ((dst3_idx + 2 >= dst3_start) && (dst3_idx + 2 < dst3_end)) ? tmp_data3.y : dst3_data.y;
-
- *((global short2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0;
- *((global short2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1;
- *((global short2 *)((__global char *)mat_dst2 + dst2_idx)) = tmp_data2;
- *((global short2 *)((__global char *)mat_dst3 + dst3_idx)) = tmp_data3;
- }
-}
-__kernel void split_vector_C3_D3 (__global short *mat_src, int src_step, int src_offset,
- __global short *mat_dst0, int dst0_step, int dst0_offset,
- __global short *mat_dst1, int dst1_step, int dst1_offset,
- __global short *mat_dst2, int dst2_step, int dst2_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- x = x << 1;
-
- int src_idx = mad24(y, src_step, src_offset);
-
- int dst0_start = mad24(y, dst0_step, dst0_offset);
- int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
- int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
-
- int dst1_start = mad24(y, dst1_step, dst1_offset);
- int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
- int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
-
- int dst2_start = mad24(y, dst2_step, dst2_offset);
- int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
- int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
+#if DATA_DEPTH == 0
+#define BASE_TYPE uchar
+#elif DATA_DEPTH == 1
+#error data_depth char, use uchar datatype instead
+#elif DATA_DEPTH == 2
+#define BASE_TYPE ushort
+#elif DATA_DEPTH == 3
+#error data_depth short, use ushort datatype instead
+#elif DATA_DEPTH == 4
+#define BASE_TYPE int
+#elif DATA_DEPTH == 5
+#define BASE_TYPE float
+#elif DATA_DEPTH == 6
+#define BASE_TYPE double
+#else
+#error data_depth
+#endif
- short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
- short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
- short2 dst2_data = *((__global short2 *)((__global char *)mat_dst2 + dst2_idx));
+#if DATA_CHAN == 2
+#define SRC_VEC_SIZE 2
+#elif DATA_CHAN == 3
+#define SRC_VEC_SIZE 4 // C3 is stored as C4
+#elif DATA_CHAN == 4
+#define SRC_VEC_SIZE 4
+#else
+#error data_chan
+#endif
- short2 tmp_data0, tmp_data1, tmp_data2;
+#define __CAT(x, y) x##y
+#define CAT(x, y) __CAT(x, y)
- short src_data_0 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x - 3];
- short src_data_1 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x - 2];
- short src_data_2 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x - 1];
- short src_data_3 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 0];
- short src_data_4 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 1];
- short src_data_5 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 2];
- short src_data_6 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 3];
- short src_data_7 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 4];
- short src_data_8 = ((__global short *)((__global char *)mat_src + src_idx))[3 * x + 5];
+#define uchar1 uchar
+#define char1 char
+#define ushort1 ushort
+#define short1 short
+#define int1 int
+#define float1 float
+#define double1 double
- tmp_data0 = (dst0_offset & 3) == 0 ? (short2)(src_data_3, src_data_6) : (short2)(src_data_0, src_data_3);
- tmp_data1 = (dst1_offset & 3) == 0 ? (short2)(src_data_4, src_data_7) : (short2)(src_data_1, src_data_4);
- tmp_data2 = (dst2_offset & 3) == 0 ? (short2)(src_data_5, src_data_8) : (short2)(src_data_2, src_data_5);
+#define TYPE BASE_TYPE
- tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? tmp_data0.x : dst0_data.x;
- tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? tmp_data0.y : dst0_data.y;
+#define SRC_TYPE CAT(BASE_TYPE, SRC_VEC_SIZE)
- tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? tmp_data1.x : dst1_data.x;
- tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? tmp_data1.y : dst1_data.y;
+#define DST_VEC_TYPE CAT(BASE_TYPE, VEC_SIZE)
- tmp_data2.x = ((dst2_idx + 0 >= dst2_start) && (dst2_idx + 0 < dst2_end)) ? tmp_data2.x : dst2_data.x;
- tmp_data2.y = ((dst2_idx + 2 >= dst2_start) && (dst2_idx + 2 < dst2_end)) ? tmp_data2.y : dst2_data.y;
-
- *((__global short2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0;
- *((__global short2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1;
- *((__global short2 *)((__global char *)mat_dst2 + dst2_idx)) = tmp_data2;
- }
-}
+#define vstore1 vstore
+#define VSTORE CAT(vstore, VEC_SIZE)
+#define VSTORE_ALIGNED(ptr, v) *((__global DST_VEC_TYPE*)(ptr)) = (v)
+#define VSTORE_UNALIGNED(ptr, v) VSTORE((v), 0, (__global TYPE*)(ptr))
+#ifdef DST0_ALIGNED
+#define VSTORE_dst0 VSTORE_ALIGNED
+#else
+#define VSTORE_dst0 VSTORE_UNALIGNED
+#endif
+#ifdef DST1_ALIGNED
+#define VSTORE_dst1 VSTORE_ALIGNED
+#else
+#define VSTORE_dst1 VSTORE_UNALIGNED
+#endif
+#ifdef DST2_ALIGNED
+#define VSTORE_dst2 VSTORE_ALIGNED
+#else
+#define VSTORE_dst2 VSTORE_UNALIGNED
+#endif
+#ifdef DST3_ALIGNED
+#define VSTORE_dst3 VSTORE_ALIGNED
+#else
+#define VSTORE_dst3 VSTORE_UNALIGNED
+#endif
-__kernel void split_vector_C2_D3 (__global short *mat_src, int src_step, int src_offset,
- __global short *mat_dst0, int dst0_step, int dst0_offset,
- __global short *mat_dst1, int dst1_step, int dst1_offset,
- int rows, int cols, int dst_step1)
+__kernel void split_vector(
+ __global SRC_TYPE* src, int srcStepBytes, int2 srcOffset, // offset.x in bytes
+ __global TYPE* dst0, int dst0StepBytes, int2 dst0Offset,
+ __global TYPE* dst1, int dst1StepBytes, int2 dst1Offset,
+#if DATA_CHAN > 2
+ __global TYPE* dst2, int dst2StepBytes, int2 dst2Offset,
+#endif
+#if DATA_CHAN > 3
+ __global TYPE* dst3, int dst3StepBytes, int2 dst3Offset,
+#endif
+ int2 size)
{
- int x = get_global_id(0);
+ int x = get_global_id(0) * VEC_SIZE;
int y = get_global_id(1);
- if((x < cols) && (y < rows))
+ if (x < size.x && y < size.y)
{
- x = x << 1;
-
- #define dst0_align ((dst0_offset & 3) << 1)
- #define dst1_align ((dst1_offset & 3) << 1)
- int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
- int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
-
- int dst0_start = mad24(y, dst0_step, dst0_offset);
- int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
- int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
-
- int dst1_start = mad24(y, dst1_step, dst1_offset);
- int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
- int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
- int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
- int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
- short4 src_data_0 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_0));
- short4 src_data_1 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_1));
- if(src_idx_0 < 0)
+ SRC_TYPE srcData[VEC_SIZE];
+ int xOffsetLimitBytes = srcOffset.x + size.x * sizeof(SRC_TYPE);
+ int xOffsetBytes = srcOffset.x + x * sizeof(SRC_TYPE);
+ int yOffsetBytes = (srcOffset.y + y) * srcStepBytes;
+#pragma unroll
+ for (int i = 0; i < VEC_SIZE; i++, xOffsetBytes += sizeof(SRC_TYPE))
{
- short4 tmp;
- tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
- src_data_0.xyzw = (src_idx_0 == -1) ? src_data_0.wxyz:tmp.xyzw;
+ srcData[i] = (xOffsetBytes >= xOffsetLimitBytes) ? (SRC_TYPE)0 :
+ *(__global SRC_TYPE*)((__global char*)src + yOffsetBytes + xOffsetBytes);
}
- if(src_idx_1< 0)
- {
- short4 tmp;
- tmp.xyzw = ( src_idx_1== -2) ? src_data_1.zwxy : src_data_1.yzwx;
- src_data_1.xyzw = ( src_idx_1== -1) ? src_data_1.wxyz : tmp.xyzw;
- }
-
-
- short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
- short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
-
- short2 tmp_data0, tmp_data1;
-
- tmp_data0.x = ((dst0_idx + 0 >= dst0_start) && (dst0_idx + 0 < dst0_end)) ? src_data_0.x : dst0_data.x;
- tmp_data0.y = ((dst0_idx + 2 >= dst0_start) && (dst0_idx + 2 < dst0_end)) ? src_data_0.z : dst0_data.y;
- tmp_data1.x = ((dst1_idx + 0 >= dst1_start) && (dst1_idx + 0 < dst1_end)) ? src_data_1.y : dst1_data.x;
- tmp_data1.y = ((dst1_idx + 2 >= dst1_start) && (dst1_idx + 2 < dst1_end)) ? src_data_1.w : dst1_data.y;
-
- *((global short2 *)((__global char *)mat_dst0 + dst0_idx)) = tmp_data0;
- *((global short2 *)((__global char *)mat_dst1 + dst1_idx)) = tmp_data1;
- }
-}
-__kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src_offset,
- __global int *mat_dst0, int dst0_step, int dst0_offset,
- __global int *mat_dst1, int dst1_step, int dst1_offset,
- __global int *mat_dst2, int dst2_step, int dst2_offset,
- __global int *mat_dst3, int dst3_step, int dst3_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- int src_idx = mad24(y, src_step, src_offset);
- int dst0_idx = mad24(y, dst0_step, dst0_offset);
- int dst1_idx = mad24(y, dst1_step, dst1_offset);
- int dst2_idx = mad24(y, dst2_step, dst2_offset);
- int dst3_idx = mad24(y, dst3_step, dst3_offset);
-
- int4 src_data = ((__global int4 *)((__global char *)mat_src + src_idx))[x];
-
- ((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
- ((__global int *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;
- ((__global int *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data.z;
- ((__global int *)((__global char *)mat_dst3 + dst3_idx))[x] = src_data.w;
- }
-}
-__kernel void split_vector_C3_D4 (__global int *mat_src, int src_step, int src_offset,
- __global int *mat_dst0, int dst0_step, int dst0_offset,
- __global int *mat_dst1, int dst1_step, int dst1_offset,
- __global int *mat_dst2, int dst2_step, int dst2_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- int src_idx = mad24(y, src_step, src_offset);
- int dst0_idx = mad24(y, dst0_step, dst0_offset);
- int dst1_idx = mad24(y, dst1_step, dst1_offset);
- int dst2_idx = mad24(y, dst2_step, dst2_offset);
-
- int src_data_0 = ((__global int *)((__global char *)mat_src + src_idx))[3 * x + 0];
- int src_data_1 = ((__global int *)((__global char *)mat_src + src_idx))[3 * x + 1];
- int src_data_2 = ((__global int *)((__global char *)mat_src + src_idx))[3 * x + 2];
-
- ((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data_0;
- ((__global int *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data_1;
- ((__global int *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data_2;
- }
-}
-
-__kernel void split_vector_C2_D4 (__global int *mat_src, int src_step, int src_offset,
- __global int *mat_dst0, int dst0_step, int dst0_offset,
- __global int *mat_dst1, int dst1_step, int dst1_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- int src_idx = mad24(y, src_step, src_offset);
- int dst0_idx = mad24(y, dst0_step, dst0_offset);
- int dst1_idx = mad24(y, dst1_step, dst1_offset);
-
- int2 src_data = ((__global int2 *)((__global char *)mat_src + src_idx))[x];
-
- ((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
- ((__global int *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;
- }
-}
-
-__kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int src_offset,
- __global float *mat_dst0, int dst0_step, int dst0_offset,
- __global float *mat_dst1, int dst1_step, int dst1_offset,
- __global float *mat_dst2, int dst2_step, int dst2_offset,
- __global float *mat_dst3, int dst3_step, int dst3_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- int src_idx = mad24(y, src_step, src_offset);
- int dst0_idx = mad24(y, dst0_step, dst0_offset);
- int dst1_idx = mad24(y, dst1_step, dst1_offset);
- int dst2_idx = mad24(y, dst2_step, dst2_offset);
- int dst3_idx = mad24(y, dst3_step, dst3_offset);
-
- float4 src_data = ((__global float4 *)((__global char *)mat_src + src_idx))[x];
-
- ((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
- ((__global float *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;
- ((__global float *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data.z;
- ((__global float *)((__global char *)mat_dst3 + dst3_idx))[x] = src_data.w;
- }
-}
-
-__kernel void split_vector_C3_D5 (__global float *mat_src, int src_step, int src_offset,
- __global float *mat_dst0, int dst0_step, int dst0_offset,
- __global float *mat_dst1, int dst1_step, int dst1_offset,
- __global float *mat_dst2, int dst2_step, int dst2_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- int src_idx = mad24(y, src_step, src_offset);
- int dst0_idx = mad24(y, dst0_step, dst0_offset);
- int dst1_idx = mad24(y, dst1_step, dst1_offset);
- int dst2_idx = mad24(y, dst2_step, dst2_offset);
-
- float src_data_0 = ((__global float *)((__global char *)mat_src + src_idx))[3 * x + 0];
- float src_data_1 = ((__global float *)((__global char *)mat_src + src_idx))[3 * x + 1];
- float src_data_2 = ((__global float *)((__global char *)mat_src + src_idx))[3 * x + 2];
-
- ((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data_0;
- ((__global float *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data_1;
- ((__global float *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data_2;
- }
-}
-
-__kernel void split_vector_C2_D5 (__global float *mat_src, int src_step, int src_offset,
- __global float *mat_dst0, int dst0_step, int dst0_offset,
- __global float *mat_dst1, int dst1_step, int dst1_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- int src_idx = mad24(y, src_step, src_offset);
- int dst0_idx = mad24(y, dst0_step, dst0_offset);
- int dst1_idx = mad24(y, dst1_step, dst1_offset);
-
- float2 src_data = ((__global float2 *)((__global char *)mat_src + src_idx))[x];
-
- ((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
- ((__global float *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;
- }
-}
-
-#if defined (DOUBLE_SUPPORT)
-__kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int src_offset,
- __global double *mat_dst0, int dst0_step, int dst0_offset,
- __global double *mat_dst1, int dst1_step, int dst1_offset,
- __global double *mat_dst2, int dst2_step, int dst2_offset,
- __global double *mat_dst3, int dst3_step, int dst3_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- int src_idx = mad24(y, src_step, src_offset);
- int dst0_idx = mad24(y, dst0_step, dst0_offset);
- int dst1_idx = mad24(y, dst1_step, dst1_offset);
- int dst2_idx = mad24(y, dst2_step, dst2_offset);
- int dst3_idx = mad24(y, dst3_step, dst3_offset);
-
- double4 src_data = ((__global double4 *)((__global char *)mat_src + src_idx))[x];
-
- ((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
- ((__global double *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;
- ((__global double *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data.z;
- ((__global double *)((__global char *)mat_dst3 + dst3_idx))[x] = src_data.w;
- }
-}
-
-__kernel void split_vector_C3_D6 (__global double *mat_src, int src_step, int src_offset,
- __global double *mat_dst0, int dst0_step, int dst0_offset,
- __global double *mat_dst1, int dst1_step, int dst1_offset,
- __global double *mat_dst2, int dst2_step, int dst2_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- int src_idx = mad24(y, src_step, src_offset);
- int dst0_idx = mad24(y, dst0_step, dst0_offset);
- int dst1_idx = mad24(y, dst1_step, dst1_offset);
- int dst2_idx = mad24(y, dst2_step, dst2_offset);
-
- double src_data_0 = ((__global double *)((__global char *)mat_src + src_idx))[3 * x + 0];
- double src_data_1 = ((__global double *)((__global char *)mat_src + src_idx))[3 * x + 1];
- double src_data_2 = ((__global double *)((__global char *)mat_src + src_idx))[3 * x + 2];
-
- ((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data_0;
- ((__global double *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data_1;
- ((__global double *)((__global char *)mat_dst2 + dst2_idx))[x] = src_data_2;
- }
-}
-
-__kernel void split_vector_C2_D6 (__global double *mat_src, int src_step, int src_offset,
- __global double *mat_dst0, int dst0_step, int dst0_offset,
- __global double *mat_dst1, int dst1_step, int dst1_offset,
- int rows, int cols, int dst_step1)
-
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
-
- if((x < cols) && (y < rows))
- {
- int src_idx = mad24(y, src_step, src_offset);
- int dst0_idx = mad24(y, dst0_step, dst0_offset);
- int dst1_idx = mad24(y, dst1_step, dst1_offset);
+#if VEC_SIZE == 1
+ TYPE dstC0 = srcData[0].s0;
+ TYPE dstC1 = srcData[0].s1;
+#if DATA_CHAN > 2
+ TYPE dstC2 = srcData[0].s2;
+#endif
+#if DATA_CHAN > 3
+ TYPE dstC3 = srcData[0].s3;
+#endif
+# define VEC_TO_ARRAY(v, a) TYPE a[1] = {v};
+#elif VEC_SIZE == 2
+ DST_VEC_TYPE dstC0 = (DST_VEC_TYPE)(srcData[0].s0, srcData[1].s0);
+ DST_VEC_TYPE dstC1 = (DST_VEC_TYPE)(srcData[0].s1, srcData[1].s1);
+#if DATA_CHAN > 2
+ DST_VEC_TYPE dstC2 = (DST_VEC_TYPE)(srcData[0].s2, srcData[1].s2);
+#endif
+#if DATA_CHAN > 3
+ DST_VEC_TYPE dstC3 = (DST_VEC_TYPE)(srcData[0].s3, srcData[1].s3);
+#endif
+# define VEC_TO_ARRAY(v, a) TYPE a[2] = {v.s0, v.s1};
+#elif VEC_SIZE == 4
+ DST_VEC_TYPE dstC0 = (DST_VEC_TYPE)(srcData[0].s0, srcData[1].s0, srcData[2].s0, srcData[3].s0);
+ DST_VEC_TYPE dstC1 = (DST_VEC_TYPE)(srcData[0].s1, srcData[1].s1, srcData[2].s1, srcData[3].s1);
+#if DATA_CHAN > 2
+ DST_VEC_TYPE dstC2 = (DST_VEC_TYPE)(srcData[0].s2, srcData[1].s2, srcData[2].s2, srcData[3].s2);
+#endif
+#if DATA_CHAN > 3
+ DST_VEC_TYPE dstC3 = (DST_VEC_TYPE)(srcData[0].s3, srcData[1].s3, srcData[2].s3, srcData[3].s3);
+#endif
+# define VEC_TO_ARRAY(v, a) TYPE a[4] = {v.s0, v.s1, v.s2, v.s3};
+#endif
- double2 src_data = ((__global double2 *)((__global char *)mat_src + src_idx))[x];
+#ifndef BYPASS_VSTORE
+#define BYPASS_VSTORE false
+#endif
- ((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
- ((__global double *)((__global char *)mat_dst1 + dst1_idx))[x] = src_data.y;
+#define WRITE_VEC_DST(dst, vecValue) \
+{ \
+ int dst ## xOffsetLimitBytes = dst ## Offset.x + size.x * sizeof(TYPE); \
+ int dst ## xOffsetBytes = dst ## Offset.x + x * sizeof(TYPE); \
+ int dst ## yOffsetBytes = (dst ## Offset.y + y) * dst ## StepBytes; \
+ if (!BYPASS_VSTORE && dst ## xOffsetBytes + sizeof(DST_VEC_TYPE) <= dst ## xOffsetLimitBytes) \
+ { \
+ VSTORE_ ## dst(((__global char*)dst + dst ## yOffsetBytes + dst ## xOffsetBytes), vecValue); \
+ } \
+ else \
+ { \
+ VEC_TO_ARRAY(vecValue, vecValue##Array); \
+ for (int i = 0; i < VEC_SIZE; i++, dst ## xOffsetBytes += sizeof(TYPE)) \
+ { \
+ if (dst ## xOffsetBytes + sizeof(TYPE) <= dst ## xOffsetLimitBytes) \
+ *(__global TYPE*)((__global char*)dst + dst ## yOffsetBytes + dst ## xOffsetBytes) = vecValue##Array[i]; \
+ else \
+ break; \
+ } \
+ } \
+}
+
+ WRITE_VEC_DST(dst0, dstC0);
+ WRITE_VEC_DST(dst1, dstC1);
+#if DATA_CHAN > 2
+ WRITE_VEC_DST(dst2, dstC2);
+#endif
+#if DATA_CHAN > 3
+ WRITE_VEC_DST(dst3, dstC3);
+#endif
}
}
-#endif
static inline void ___openCLSafeCall(int err, const char *file, const int line, const char *func = "")
{
- if( CL_SUCCESS != err)
+ if (CL_SUCCESS != err)
cv::ocl::error(getOpenCLErrorString(err), file, line, func);
}
}
mat_dst.create(size, CV_MAKETYPE(depth, total_channels));
merge_vector_run(mat_src, n, mat_dst);
}
- static void split_vector_run(const oclMat &mat_src, oclMat *mat_dst)
+ static void split_vector_run(const oclMat &src, oclMat *dst)
{
- if(!mat_src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && mat_src.type() == CV_64F)
+ if(!src.clCxt->supportsFeature(FEATURE_CL_DOUBLE) && src.type() == CV_64F)
{
CV_Error(CV_OpenCLDoubleNotSupported, "Selected device doesn't support double");
return;
}
- Context *clCxt = mat_src.clCxt;
- int channels = mat_src.oclchannels();
- int depth = mat_src.depth();
+ Context *clCtx = src.clCxt;
+ int channels = src.channels();
+ int depth = src.depth();
+ depth = (depth == CV_8S) ? CV_8U : depth;
+ depth = (depth == CV_16S) ? CV_16U : depth;
string kernelName = "split_vector";
- int vector_lengths[4][7] = {{0, 0, 0, 0, 0, 0, 0},
- {4, 4, 2, 2, 1, 1, 1},
- {4, 4, 2, 2 , 1, 1, 1},
- {4, 4, 2, 2, 1, 1, 1}
- };
-
- size_t vector_length = vector_lengths[channels - 1][mat_dst[0].depth()];
-
- int max_offset_cols = 0;
- for(int i = 0; i < channels; i++)
- {
- int offset_cols = (mat_dst[i].offset / mat_dst[i].elemSize()) & (vector_length - 1);
- if(max_offset_cols < offset_cols)
- max_offset_cols = offset_cols;
- }
-
- int cols = vector_length == 1 ? divUp(mat_src.cols, vector_length)
- : divUp(mat_src.cols + max_offset_cols, vector_length);
-
- size_t localThreads[3] = { 64, 4, 1 };
- size_t globalThreads[3] = { cols, mat_src.rows, 1 };
+ size_t VEC_SIZE = 4;
- int dst_step1 = mat_dst[0].cols * mat_dst[0].elemSize();
vector<pair<size_t , const void *> > args;
- args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_src.data));
- args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src.step));
- args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src.offset));
- args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_dst[0].data));
- args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst[0].step));
- args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst[0].offset));
- args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_dst[1].data));
- args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst[1].step));
- args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst[1].offset));
- if(channels >= 3)
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&src.step));
+ int srcOffsetXBytes = src.offset % src.step;
+ int srcOffsetY = src.offset / src.step;
+ cl_int2 srcOffset = {{srcOffsetXBytes, srcOffsetY}};
+ args.push_back( make_pair( sizeof(cl_int2), (void *)&srcOffset));
+
+ bool dst0Aligned = false, dst1Aligned = false, dst2Aligned = false, dst3Aligned = false;
+ int alignSize = dst[0].elemSize1() * VEC_SIZE;
+ int alignMask = alignSize - 1;
+
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&dst[0].data));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&dst[0].step));
+ int dst0OffsetXBytes = dst[0].offset % dst[0].step;
+ int dst0OffsetY = dst[0].offset / dst[0].step;
+ cl_int2 dst0Offset = {{dst0OffsetXBytes, dst0OffsetY}};
+ args.push_back( make_pair( sizeof(cl_int2), (void *)&dst0Offset));
+ if ((dst0OffsetXBytes & alignMask) == 0)
+ dst0Aligned = true;
+
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&dst[1].data));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&dst[1].step));
+ int dst1OffsetXBytes = dst[1].offset % dst[1].step;
+ int dst1OffsetY = dst[1].offset / dst[1].step;
+ cl_int2 dst1Offset = {{dst1OffsetXBytes, dst1OffsetY}};
+ args.push_back( make_pair( sizeof(cl_int2), (void *)&dst1Offset));
+ if ((dst1OffsetXBytes & alignMask) == 0)
+ dst1Aligned = true;
+
+ // DON'T MOVE VARIABLES INTO 'IF' BODY
+ int dst2OffsetXBytes, dst2OffsetY;
+ cl_int2 dst2Offset;
+ int dst3OffsetXBytes, dst3OffsetY;
+ cl_int2 dst3Offset;
+ if (channels >= 3)
{
-
- args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_dst[2].data));
- args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst[2].step));
- args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst[2].offset));
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&dst[2].data));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&dst[2].step));
+ dst2OffsetXBytes = dst[2].offset % dst[2].step;
+ dst2OffsetY = dst[2].offset / dst[2].step;
+ dst2Offset.s[0] = dst2OffsetXBytes; dst2Offset.s[1] = dst2OffsetY;
+ args.push_back( make_pair( sizeof(cl_int2), (void *)&dst2Offset));
+ if ((dst2OffsetXBytes & alignMask) == 0)
+ dst2Aligned = true;
}
- if(channels >= 4)
+
+ if (channels >= 4)
{
- args.push_back( make_pair( sizeof(cl_mem), (void *)&mat_dst[3].data));
- args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst[3].step));
- args.push_back( make_pair( sizeof(cl_int), (void *)&mat_dst[3].offset));
+ args.push_back( make_pair( sizeof(cl_mem), (void *)&dst[3].data));
+ args.push_back( make_pair( sizeof(cl_int), (void *)&dst[3].step));
+ dst3OffsetXBytes = dst[3].offset % dst[3].step;
+ dst3OffsetY = dst[3].offset / dst[3].step;
+ dst3Offset.s[0] = dst3OffsetXBytes; dst3Offset.s[1] = dst3OffsetY;
+ args.push_back( make_pair( sizeof(cl_int2), (void *)&dst3Offset));
+ if ((dst3OffsetXBytes & alignMask) == 0)
+ dst3Aligned = true;
}
- args.push_back( make_pair( sizeof(cl_int), (void *)&mat_src.rows));
- args.push_back( make_pair( sizeof(cl_int), (void *)&cols));
- args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1));
-
- openCLExecuteKernel(clCxt, &split_mat, kernelName, globalThreads, localThreads, args, channels, depth);
+ cl_int2 size = {{ src.cols, src.rows }};
+ args.push_back( make_pair( sizeof(cl_int2), (void *)&size));
+
+ string build_options =
+ cv::format("-D VEC_SIZE=%d -D DATA_DEPTH=%d -D DATA_CHAN=%d",
+ (int)VEC_SIZE, depth, channels);
+
+ if (dst0Aligned)
+ build_options += " -D DST0_ALIGNED";
+ if (dst1Aligned)
+ build_options += " -D DST1_ALIGNED";
+ if (dst2Aligned)
+ build_options += " -D DST2_ALIGNED";
+ if (dst3Aligned)
+ build_options += " -D DST3_ALIGNED";
+
+ const DeviceInfo& devInfo = clCtx->getDeviceInfo();
+
+ // TODO Workaround for issues. Need to investigate a problem.
+ if (channels == 2
+ && devInfo.deviceType == CVCL_DEVICE_TYPE_CPU
+ && devInfo.platform->platformVendor.find("Intel") != std::string::npos
+ && (devInfo.deviceVersion.find("Build 56860") != std::string::npos
+ || devInfo.deviceVersion.find("Build 76921") != std::string::npos))
+ build_options += " -D BYPASS_VSTORE=true";
+
+ size_t globalThreads[3] = { divUp(src.cols, VEC_SIZE), src.rows, 1 };
+ openCLExecuteKernel(clCtx, &split_mat, kernelName, globalThreads, NULL, args, -1, -1, build_options.c_str());
}
static void split(const oclMat &mat_src, oclMat *mat_dst)
{
CV_Assert(mat_dst);
int depth = mat_src.depth();
- int num_channels = mat_src.oclchannels();
+ int num_channels = mat_src.channels();
Size size = mat_src.size();
- if(num_channels == 1)
+ if (num_channels == 1)
{
mat_src.copyTo(mat_dst[0]);
return;
}
- int i;
- for(i = 0; i < num_channels; i++)
+ for (int i = 0; i < mat_src.oclchannels(); i++)
mat_dst[i].create(size, CV_MAKETYPE(depth, 1));
split_vector_run(mat_src, mat_dst);
}
void cv::ocl::split(const oclMat &src, vector<oclMat> &dst)
{
- dst.resize(src.oclchannels());
+ dst.resize(src.oclchannels()); // TODO Why oclchannels?
if(src.oclchannels() > 0)
split_merge::split(src, &dst[0]);
}
GaussianBlur(src_roi, dst_roi, Size(ksize, ksize), sigma1, sigma2, borderType);
ocl::GaussianBlur(gsrc_roi, gdst_roi, Size(ksize, ksize), sigma1, sigma2, borderType);
- Near();
+ Near(CV_MAT_DEPTH(type) == CV_8U ? 3 : 1e-6, false);
}
}
(int)BORDER_REFLECT|BORDER_ISOLATED, (int)BORDER_WRAP|BORDER_ISOLATED, \
(int)BORDER_REFLECT_101|BORDER_ISOLATED*/) // WRAP and ISOLATED are not supported by cv:: version
+#define FILTER_DATATYPES Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_8UC4, \
+ CV_32FC1, CV_32FC3, CV_32FC4, \
+ CV_64FC1, CV_64FC3, CV_64FC4)
INSTANTIATE_TEST_CASE_P(Filter, Blur, Combine(
- Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4),
+ FILTER_DATATYPES,
Values(3, 5, 7),
Values(Size(0, 0)), // not used
FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED,
Bool()));
INSTANTIATE_TEST_CASE_P(Filter, LaplacianTest, Combine(
- Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
+ FILTER_DATATYPES,
Values(1, 3),
Values(Size(0, 0)), // not used
FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED,
Bool()));
INSTANTIATE_TEST_CASE_P(Filter, Filter2D, testing::Combine(
- Values(CV_8UC1, CV_32FC1, CV_32FC4),
+ FILTER_DATATYPES,
Values(3, 15), // TODO 25: CPU implementation has some issues
Values(Size(-1, -1), Size(0, 0), Size(2, 1)), // anchor
FILTER_BORDER_SET_NO_WRAP_NO_ISOLATED,
Bool()));
INSTANTIATE_TEST_CASE_P(Filter, MedianFilter, Combine(
- Values((MatType)CV_8UC1, (MatType)CV_8UC4, (MatType)CV_32FC1, (MatType)CV_32FC4),
+ Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
Values(3, 5),
Values(Size(0, 0)), // not used
Values(0), // not used
useRoi = GET_PARAM(3);
}
- void random_roi()
+ virtual void random_roi()
{
Size roiSize = randomSize(1, MAX_VALUE);
Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
////////////////////////////////cornerMinEigenVal//////////////////////////////////////////
-typedef ImgprocTestBase CornerMinEigenVal;
+struct CornerTestBase :
+ public ImgprocTestBase
+{
+ virtual void random_roi()
+ {
+ Mat image = readImageType("gpu/stereobm/aloe-L.png", type);
+ ASSERT_FALSE(image.empty());
+
+ Size roiSize = image.size();
+ Border srcBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
+
+ Size wholeSize = Size(roiSize.width + srcBorder.lef + srcBorder.rig, roiSize.height + srcBorder.top + srcBorder.bot);
+ src = randomMat(wholeSize, type, -255, 255, false);
+ src_roi = src(Rect(srcBorder.lef, srcBorder.top, roiSize.width, roiSize.height));
+ image.copyTo(src_roi);
+
+ Border dstBorder = randomBorder(0, useRoi ? MAX_VALUE : 0);
+ randomSubMat(dst_whole, dst_roi, roiSize, dstBorder, CV_32FC1, 5, 16);
+
+ generateOclMat(gsrc_whole, gsrc_roi, src, roiSize, srcBorder);
+ generateOclMat(gdst_whole, gdst_roi, dst_whole, roiSize, dstBorder);
+ }
+};
+
+typedef CornerTestBase CornerMinEigenVal;
OCL_TEST_P(CornerMinEigenVal, Mat)
{
cornerMinEigenVal(src_roi, dst_roi, blockSize, apertureSize, borderType);
ocl::cornerMinEigenVal(gsrc_roi, gdst_roi, blockSize, apertureSize, borderType);
- Near(1.0);
+ Near(0.02);
}
}
////////////////////////////////cornerHarris//////////////////////////////////////////
-typedef ImgprocTestBase CornerHarris;
+typedef CornerTestBase CornerHarris;
OCL_TEST_P(CornerHarris, Mat)
{
random_roi();
int apertureSize = 3;
- double k = 2.0;
+ double k = randomDouble(0.01, 0.9);
cornerHarris(src_roi, dst_roi, blockSize, apertureSize, k, borderType);
ocl::cornerHarris(gsrc_roi, gdst_roi, blockSize, apertureSize, k, borderType);
- Near(1.0);
+ Near(0.02);
}
}
Bool()));
INSTANTIATE_TEST_CASE_P(Imgproc, CornerMinEigenVal, Combine(
- Values(CV_8UC1, CV_32FC1),
- Values(3), // TODO some fails when blockSize != 3 (for example 5)
- Values((int)BORDER_REFLECT, (int)BORDER_CONSTANT, (int)BORDER_REPLICATE), // TODO does not work with (int)BORDER_REFLECT101
+ Values((MatType)CV_8UC1, (MatType)CV_32FC1),
+ Values(3, 5),
+ Values((int)BORDER_CONSTANT, (int)BORDER_REPLICATE, (int)BORDER_REFLECT, (int)BORDER_REFLECT101),
Bool()));
INSTANTIATE_TEST_CASE_P(Imgproc, CornerHarris, Combine(
Values((MatType)CV_8UC1), // TODO does not work properly with CV_32FC1
Values(3, 5),
- Values((int)BORDER_REFLECT101, (int)BORDER_REFLECT, (int)BORDER_CONSTANT, (int)BORDER_REPLICATE),
+ Values( (int)BORDER_CONSTANT, (int)BORDER_REPLICATE, (int)BORDER_REFLECT, (int)BORDER_REFLECT_101),
Bool()));
INSTANTIATE_TEST_CASE_P(Imgproc, Integral, Combine(
- Values((MatType)CV_8UC1), // TODO does work with CV_32F, CV_64F
+ Values((MatType)CV_8UC1), // TODO does not work with CV_32F, CV_64F
Values(0), // not used
Values(0), // not used
Bool()));
int channels;
bool use_roi;
- //src mat
- cv::Mat mat;
-
- //dstmat
- cv::Mat dst[MAX_CHANNELS];
-
- // set up roi
- int roicols, roirows;
- int srcx, srcy;
- int dstx[MAX_CHANNELS];
- int dsty[MAX_CHANNELS];
-
- //src mat with roi
- cv::Mat mat_roi;
-
- //dst mat with roi
- cv::Mat dst_roi[MAX_CHANNELS];
+ cv::Mat src, src_roi;
+ cv::Mat dst[MAX_CHANNELS], dst_roi[MAX_CHANNELS];
- //ocl dst mat for testing
- cv::ocl::oclMat gdst_whole[MAX_CHANNELS];
-
- //ocl mat with roi
- cv::ocl::oclMat gmat;
- cv::ocl::oclMat gdst[MAX_CHANNELS];
+ cv::ocl::oclMat gsrc_whole, gsrc_roi;
+ cv::ocl::oclMat gdst_whole[MAX_CHANNELS], gdst_roi[MAX_CHANNELS];
virtual void SetUp()
{
type = GET_PARAM(0);
channels = GET_PARAM(1);
use_roi = GET_PARAM(2);
-
- cv::Size size(MWIDTH, MHEIGHT);
-
- mat = randomMat(size, CV_MAKETYPE(type, channels), 5, 16, false);
- for (int i = 0; i < channels; ++i)
- dst[i] = randomMat(size, CV_MAKETYPE(type, 1), 5, 16, false); }
+ }
void random_roi()
{
- if (use_roi)
- {
- //randomize ROI
- roicols = rng.uniform(1, mat.cols);
- roirows = rng.uniform(1, mat.rows);
- srcx = rng.uniform(0, mat.cols - roicols);
- srcy = rng.uniform(0, mat.rows - roirows);
-
- for (int i = 0; i < channels; ++i)
- {
- dstx[i] = rng.uniform(0, dst[i].cols - roicols);
- dsty[i] = rng.uniform(0, dst[i].rows - roirows);
- }
- }
- else
- {
- roicols = mat.cols;
- roirows = mat.rows;
- srcx = srcy = 0;
-
- for (int i = 0; i < channels; ++i)
- dstx[i] = dsty[i] = 0;
- }
-
- mat_roi = mat(Rect(srcx, srcy, roicols, roirows));
-
- for (int i = 0; i < channels; ++i)
- dst_roi[i] = dst[i](Rect(dstx[i], dsty[i], roicols, roirows));
+ Size roiSize = randomSize(1, MAX_VALUE);
+ Border srcBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+ randomSubMat(src, src_roi, roiSize, srcBorder, CV_MAKETYPE(type, channels), 0, 256);
+ generateOclMat(gsrc_whole, gsrc_roi, src, roiSize, srcBorder);
for (int i = 0; i < channels; ++i)
{
- gdst_whole[i] = dst[i];
- gdst[i] = gdst_whole[i](Rect(dstx[i], dsty[i], roicols, roirows));
+ Border dstBorder = randomBorder(0, use_roi ? MAX_VALUE : 0);
+ randomSubMat(dst[i], dst_roi[i], roiSize, dstBorder, CV_MAKETYPE(type, 1), 5, 16);
+ generateOclMat(gdst_whole[i], gdst_roi[i], dst[i], roiSize, dstBorder);
}
-
- gmat = mat_roi;
}
};
{
random_roi();
- cv::split(mat_roi, dst_roi);
- cv::ocl::split(gmat, gdst);
+ cv::split(src_roi, dst_roi);
+ cv::ocl::split(gsrc_roi, gdst_roi);
for (int i = 0; i < channels; ++i)
- EXPECT_MAT_NEAR(dst[i], Mat(gdst_whole[i]), 0.0);
+ {
+ EXPECT_MAT_NEAR(dst[i], gdst_whole[i], 0.0);
+ EXPECT_MAT_NEAR(dst_roi[i], gdst_roi[i], 0.0);
+ }
}
}
void showDiff(const Mat& gold, const Mat& actual, double eps, bool alwaysShow)
{
- Mat diff;
+ Mat diff, diff_thresh;
absdiff(gold, actual, diff);
diff.convertTo(diff, CV_32F);
- threshold(diff, diff, eps, 255.0, cv::THRESH_BINARY);
+ threshold(diff, diff_thresh, eps, 255.0, cv::THRESH_BINARY);
- if (alwaysShow || cv::countNonZero(diff.reshape(1)) > 0)
+ if (alwaysShow || cv::countNonZero(diff_thresh.reshape(1)) > 0)
{
namedWindow("gold", WINDOW_NORMAL);
namedWindow("actual", WINDOW_NORMAL);
{ \
ASSERT_EQ(mat1.type(), mat2.type()); \
ASSERT_EQ(mat1.size(), mat2.size()); \
- EXPECT_LE(checkNorm(cv::Mat(mat1), cv::Mat(mat2)), eps); \
+ EXPECT_LE(checkNorm(cv::Mat(mat1), cv::Mat(mat2)), eps) \
+ << cv::format("Size: %d x %d", mat1.cols, mat1.rows) << std::endl; \
}
#define EXPECT_MAT_NEAR_RELATIVE(mat1, mat2, eps) \
{ \
ASSERT_EQ(mat1.type(), mat2.type()); \
ASSERT_EQ(mat1.size(), mat2.size()); \
- EXPECT_LE(checkNormRelative(cv::Mat(mat1), cv::Mat(mat2)), eps); \
+ EXPECT_LE(checkNormRelative(cv::Mat(mat1), cv::Mat(mat2)), eps) \
+ << cv::format("Size: %d x %d", mat1.cols, mat1.rows) << std::endl; \
}
#define EXPECT_MAT_SIMILAR(mat1, mat2, eps) \
#include "perf_precomp.hpp"
-#ifdef HAVE_OPENCL
+#ifdef HAVE_OPENCV_OCL
#include "opencv2/ocl/ocl.hpp"
using namespace std;
// handle the case when there will be measurement before the next predict.
statePre.copyTo(statePost);
+ errorCovPre.copyTo(errorCovPost);
return statePre;
}
target_link_libraries(${the_target} opencv_nonfree)
endif()
- if(HAVE_OPENCL)
+ if(HAVE_opencv_ocl)
target_link_libraries(${the_target} opencv_ocl)
endif()
else
frame0.copyTo(frameCopy);
getFlowField(flow_vec[0], flow_vec[1], show_flow);
- imshow("PyrLK [Sparse]", show_flow);
+ imshow("tvl1 optical flow field", show_flow);
}
if( waitKey( 10 ) >= 0 )