CV_EXPORTS const char* typeToStr(int t);
CV_EXPORTS const char* memopTypeToStr(int t);
CV_EXPORTS const char* vecopTypeToStr(int t);
+CV_EXPORTS const char* getOpenCLErrorString(int errorCode);
CV_EXPORTS String kernelToStr(InputArray _kernel, int ddepth = -1, const char * name = NULL);
CV_EXPORTS void getPlatfomsInfo(std::vector<PlatformInfo>& platform_info);
Impl* p;
};
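+//! @brief OpenCL kernel execution timer based on cl_event profiling info.
+//! A sketch of intent: start()/stop() bracket the measured region and
+//! microSeconds()/milliSeconds() report the elapsed GPU time; the Queue
+//! passed in must have been created with profiling enabled.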
+class CV_EXPORTS Timer
+{
+public:
+ Timer(const Queue& q);
+ ~Timer();
+ void start();
+ void stop();
+ float milliSeconds();
+ float microSeconds();
+ float seconds();
+
+protected:
+ struct Impl;
+ Impl* p;
+};
CV_EXPORTS MatAllocator* getOpenCLAllocator();
--- /dev/null
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CONFIGURATION_PRIVATE_HPP
+#define OPENCV_CONFIGURATION_PRIVATE_HPP
+
+namespace cv { namespace utils {
+
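+// Accessors for process-wide configuration parameters (read from
+// environment variables), returning defaultValue when the parameter is unset.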
+CV_EXPORTS bool getConfigurationParameterBool(const char* name, bool defaultValue);
+CV_EXPORTS size_t getConfigurationParameterSizeT(const char* name, size_t defaultValue);
+CV_EXPORTS cv::String getConfigurationParameterString(const char* name, const char* defaultValue);
+
+}} // namespace
+
+#endif // OPENCV_CONFIGURATION_PRIVATE_HPP
#include <inttypes.h>
#endif
+#include <opencv2/core/utils/configuration.private.hpp>
+
#include "opencv2/core/ocl_genbase.hpp"
+#include "opencl_kernels_core.hpp"
#define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG 0
#define CV_OPENCL_SHOW_RUN_ERRORS 0
return buf;
}
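+// Map an OpenCL (or clBLAS extension) status code to its symbolic name.
+// Usage sketch (hypothetical call site):
+//   cl_int status = clFinish(q);
+//   if (status != CL_SUCCESS)
+//       CV_Error(Error::OpenCLApiCallError, getOpenCLErrorString(status));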
+const char* getOpenCLErrorString(int errorCode)
+{
+ switch (errorCode)
+ {
+ case 0: return "CL_SUCCESS";
+ case -1: return "CL_DEVICE_NOT_FOUND";
+ case -2: return "CL_DEVICE_NOT_AVAILABLE";
+ case -3: return "CL_COMPILER_NOT_AVAILABLE";
+ case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
+ case -5: return "CL_OUT_OF_RESOURCES";
+ case -6: return "CL_OUT_OF_HOST_MEMORY";
+ case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE";
+ case -8: return "CL_MEM_COPY_OVERLAP";
+ case -9: return "CL_IMAGE_FORMAT_MISMATCH";
+ case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
+ case -11: return "CL_BUILD_PROGRAM_FAILURE";
+ case -12: return "CL_MAP_FAILURE";
+ case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
+ case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
+ case -15: return "CL_COMPILE_PROGRAM_FAILURE";
+ case -16: return "CL_LINKER_NOT_AVAILABLE";
+ case -17: return "CL_LINK_PROGRAM_FAILURE";
+ case -18: return "CL_DEVICE_PARTITION_FAILED";
+ case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
+ case -30: return "CL_INVALID_VALUE";
+ case -31: return "CL_INVALID_DEVICE_TYPE";
+ case -32: return "CL_INVALID_PLATFORM";
+ case -33: return "CL_INVALID_DEVICE";
+ case -34: return "CL_INVALID_CONTEXT";
+ case -35: return "CL_INVALID_QUEUE_PROPERTIES";
+ case -36: return "CL_INVALID_COMMAND_QUEUE";
+ case -37: return "CL_INVALID_HOST_PTR";
+ case -38: return "CL_INVALID_MEM_OBJECT";
+ case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
+ case -40: return "CL_INVALID_IMAGE_SIZE";
+ case -41: return "CL_INVALID_SAMPLER";
+ case -42: return "CL_INVALID_BINARY";
+ case -43: return "CL_INVALID_BUILD_OPTIONS";
+ case -44: return "CL_INVALID_PROGRAM";
+ case -45: return "CL_INVALID_PROGRAM_EXECUTABLE";
+ case -46: return "CL_INVALID_KERNEL_NAME";
+ case -47: return "CL_INVALID_KERNEL_DEFINITION";
+ case -48: return "CL_INVALID_KERNEL";
+ case -49: return "CL_INVALID_ARG_INDEX";
+ case -50: return "CL_INVALID_ARG_VALUE";
+ case -51: return "CL_INVALID_ARG_SIZE";
+ case -52: return "CL_INVALID_KERNEL_ARGS";
+ case -53: return "CL_INVALID_WORK_DIMENSION";
+ case -54: return "CL_INVALID_WORK_GROUP_SIZE";
+ case -55: return "CL_INVALID_WORK_ITEM_SIZE";
+ case -56: return "CL_INVALID_GLOBAL_OFFSET";
+ case -57: return "CL_INVALID_EVENT_WAIT_LIST";
+ case -58: return "CL_INVALID_EVENT";
+ case -59: return "CL_INVALID_OPERATION";
+ case -60: return "CL_INVALID_GL_OBJECT";
+ case -61: return "CL_INVALID_BUFFER_SIZE";
+ case -62: return "CL_INVALID_MIP_LEVEL";
+ case -63: return "CL_INVALID_GLOBAL_WORK_SIZE";
+ case -64: return "CL_INVALID_PROPERTY";
+ case -65: return "CL_INVALID_IMAGE_DESCRIPTOR";
+ case -66: return "CL_INVALID_COMPILER_OPTIONS";
+ case -67: return "CL_INVALID_LINKER_OPTIONS";
+ case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT";
+ case -69: return "CL_INVALID_PIPE_SIZE";
+ case -70: return "CL_INVALID_DEVICE_QUEUE";
+ case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR";
+ case -1001: return "CL_PLATFORM_NOT_FOUND_KHR";
+ case -1002: return "CL_INVALID_D3D10_DEVICE_KHR";
+ case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR";
+ case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR";
+ case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR";
+ case -1024: return "clBLAS: Functionality is not implemented";
+ case -1023: return "clBLAS: Library is not initialized yet";
+ case -1022: return "clBLAS: Matrix A is not a valid memory object";
+ case -1021: return "clBLAS: Matrix B is not a valid memory object";
+ case -1020: return "clBLAS: Matrix C is not a valid memory object";
+ case -1019: return "clBLAS: Vector X is not a valid memory object";
+ case -1018: return "clBLAS: Vector Y is not a valid memory object";
+ case -1017: return "clBLAS: An input dimension (M:N:K) is invalid";
+ case -1016: return "clBLAS: Leading dimension A must not be less than the "
+ "size of the first dimension";
+ case -1015: return "clBLAS: Leading dimension B must not be less than the "
+ "size of the second dimension";
+ case -1014: return "clBLAS: Leading dimension C must not be less than the "
+ "size of the third dimension";
+ case -1013: return "clBLAS: The increment for a vector X must not be 0";
+ case -1012: return "clBLAS: The increment for a vector Y must not be 0";
+ case -1011: return "clBLAS: The memory object for Matrix A is too small";
+ case -1010: return "clBLAS: The memory object for Matrix B is too small";
+ case -1009: return "clBLAS: The memory object for Matrix C is too small";
+ case -1008: return "clBLAS: The memory object for Vector X is too small";
+ case -1007: return "clBLAS: The memory object for Vector Y is too small";
+ default: return "Unknown OpenCL error";
+ }
+}
+
template <typename T>
static std::string kerToStr(const Mat & k)
{
return true;
}
+struct Timer::Impl
+{
+ const Queue queue;
+
+    Impl(const Queue& q)
+        : queue(q)
+        , initted_(false)
+        , running_(false)
+        , has_run_at_least_once_(false)
+        , elapsed_milliseconds_(0.f)
+        , elapsed_microseconds_(0.f)
+    {
+        init();
+    }
+
+    ~Impl()
+    {
+#ifdef HAVE_OPENCL
+        clWaitForEvents(1, &start_gpu_cl_);
+        clWaitForEvents(1, &stop_gpu_cl_);
+        clReleaseEvent(start_gpu_cl_);
+        clReleaseEvent(stop_gpu_cl_);
+#endif
+    }
+
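+    // start()/stop() enqueue a trivial "null" kernel solely to obtain a
+    // cl_event whose profiling timestamps bracket the measured region.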
+ void start()
+ {
+#ifdef HAVE_OPENCL
+ if (!running())
+ {
+ clWaitForEvents(1, &start_gpu_cl_);
+ clReleaseEvent(start_gpu_cl_);
+ ocl::Kernel kernel("null_kernel_float", ocl::core::benchmark_oclsrc);
+ float arg = 0;
+ clSetKernelArg((cl_kernel)kernel.ptr(), 0, sizeof(arg), &arg);
+ clEnqueueTask((cl_command_queue)queue.ptr(), (cl_kernel)kernel.ptr(), 0,
+ NULL, &start_gpu_cl_);
+ clFinish((cl_command_queue)queue.ptr());
+ running_ = true;
+ has_run_at_least_once_ = true;
+ }
+#endif
+ }
+
+ void stop()
+ {
+#ifdef HAVE_OPENCL
+ if (running())
+ {
+ clWaitForEvents(1, &stop_gpu_cl_);
+ clReleaseEvent(stop_gpu_cl_);
+ ocl::Kernel kernel("null_kernel_float", ocl::core::benchmark_oclsrc);
+ float arg = 0;
+ clSetKernelArg((cl_kernel)kernel.ptr(), 0, sizeof(arg), &arg);
+ clEnqueueTask((cl_command_queue)queue.ptr(), (cl_kernel)kernel.ptr(), 0,
+ NULL, &stop_gpu_cl_);
+ clFinish((cl_command_queue)queue.ptr());
+ running_ = false;
+ }
+#endif
+ }
+
+ float microSeconds()
+ {
+#ifdef HAVE_OPENCL
+ if (!has_run_at_least_once())
+ {
+ return 0;
+ }
+ if (running())
+ {
+ stop();
+ }
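+        // OpenCL profiling timestamps are in nanoseconds; convert to microseconds.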
+        cl_ulong startTime = 0, stopTime = 0;
+ clWaitForEvents(1, &stop_gpu_cl_);
+ clGetEventProfilingInfo(start_gpu_cl_, CL_PROFILING_COMMAND_END,
+ sizeof startTime, &startTime, NULL);
+ clGetEventProfilingInfo(stop_gpu_cl_, CL_PROFILING_COMMAND_START,
+ sizeof stopTime, &stopTime, NULL);
+ double us = static_cast<double>(stopTime - startTime) / 1000.0;
+ elapsed_microseconds_ = static_cast<float>(us);
+ return elapsed_microseconds_;
+#else
+ return 0;
+#endif
+ }
+
+ float milliSeconds()
+ {
+#ifdef HAVE_OPENCL
+ if (!has_run_at_least_once())
+ {
+ return 0;
+ }
+ if (running())
+ {
+ stop();
+ }
+ cl_ulong startTime = 0, stopTime = 0;
+ clGetEventProfilingInfo(start_gpu_cl_, CL_PROFILING_COMMAND_END,
+ sizeof startTime, &startTime, NULL);
+ clGetEventProfilingInfo(stop_gpu_cl_, CL_PROFILING_COMMAND_START,
+ sizeof stopTime, &stopTime, NULL);
+ double ms = static_cast<double>(stopTime - startTime) / 1000000.0;
+ elapsed_milliseconds_ = static_cast<float>(ms);
+ return elapsed_milliseconds_;
+#else
+ return 0;
+#endif
+ }
+
+ float seconds()
+ {
+ return milliSeconds() / 1000.f;
+ }
+
+ void init()
+ {
+ CV_Assert(queue.getImpl() && queue.getImpl()->isProfilingQueue_);
+ if (!initted())
+ {
+ start_gpu_cl_ = 0;
+ stop_gpu_cl_ = 0;
+ initted_ = true;
+ }
+ }
+
+ inline bool initted() { return initted_; }
+ inline bool running() { return running_; }
+ inline bool has_run_at_least_once() { return has_run_at_least_once_; }
+
+ bool initted_;
+ bool running_;
+ bool has_run_at_least_once_;
+ float elapsed_milliseconds_;
+ float elapsed_microseconds_;
+ cl_event start_gpu_cl_;
+ cl_event stop_gpu_cl_;
+};
+
+Timer::Timer(const Queue& q)
+{
+ p = new Impl(q);
+}
+
+Timer::~Timer()
+{
+ if(p)
+ {
+ delete p;
+ p = 0;
+ }
+}
+
+void Timer::start()
+{
+ if(p)
+ p->start();
+}
+
+void Timer::stop()
+{
+ if(p)
+ p->stop();
+}
+
+float Timer::microSeconds()
+{ return p ? p->microSeconds() : 0; }
+
+float Timer::milliSeconds()
+{ return p ? p->milliSeconds() : 0; }
+
+float Timer::seconds()
+{ return p ? p->seconds() : 0; }
+
}}
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
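+// Trivial no-op kernel; cv::ocl::Timer enqueues it only to generate the
+// profiling events used as timing markers.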
+__kernel void null_kernel_float(float arg) {
+ float out = arg;
+}
#define CL_RUNTIME_EXPORT
#endif
-namespace utils {
-bool getConfigurationParameterBool(const char* name, bool defaultValue);
-size_t getConfigurationParameterSizeT(const char* name, size_t defaultValue);
-cv::String getConfigurationParameterString(const char* name, const char* defaultValue);
-}
-
extern bool __termination; // skip some cleanups, because process is terminating
// (for example, if ExitProcess() was already called)
#include "precomp.hpp"
#include <iostream>
+#include <opencv2/core/utils/configuration.private.hpp>
#include <opencv2/core/utils/trace.private.hpp>
namespace cv {
#include <opencv2/core/utils/trace.hpp>
#include <opencv2/core/utils/trace.private.hpp>
+#include <opencv2/core/utils/configuration.private.hpp>
#include <cstdarg> // va_start
UMat hdr;
if(!data)
return hdr;
- Size wholeSize;
- Point ofs;
- locateROI(wholeSize, ofs);
- Size sz(cols, rows);
- if (ofs.x != 0 || ofs.y != 0)
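+    // This is a ROI view: temporarily expand to the whole matrix, wrap it as
+    // a UMat, then re-apply the ROI rectangle on the result.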
+ if (data != datastart)
{
- Mat src = *this;
- int dtop = ofs.y;
- int dbottom = wholeSize.height - src.rows - ofs.y;
- int dleft = ofs.x;
- int dright = wholeSize.width - src.cols - ofs.x;
- src.adjustROI(dtop, dbottom, dleft, dright);
- return src.getUMat(accessFlags, usageFlags)(cv::Rect(ofs.x, ofs.y, sz.width, sz.height));
+ Size wholeSize;
+ Point ofs;
+ locateROI(wholeSize, ofs);
+ Size sz(cols, rows);
+ if (ofs.x != 0 || ofs.y != 0)
+ {
+ Mat src = *this;
+ int dtop = ofs.y;
+ int dbottom = wholeSize.height - src.rows - ofs.y;
+ int dleft = ofs.x;
+ int dright = wholeSize.width - src.cols - ofs.x;
+ src.adjustROI(dtop, dbottom, dleft, dright);
+ return src.getUMat(accessFlags, usageFlags)(cv::Rect(ofs.x, ofs.y, sz.width, sz.height));
+ }
}
CV_Assert(data == datastart);
)
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4701 /wd4100)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src/ocl4dnn/include ${OPENCL_INCLUDE_DIRS})
+
if(MSVC)
add_definitions( -D_CRT_SECURE_NO_WARNINGS=1 )
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4244 /wd4267 /wd4018 /wd4355 /wd4800 /wd4251 /wd4996 /wd4146
CV_PROP String name; //!< Name of the layer instance, can be used for logging or other internal purposes.
CV_PROP String type; //!< Type name which was used for creating layer by layer factory.
+    CV_PROP int preferableTarget; //!< preferred target for layer forwarding
Layer();
    explicit Layer(const LayerParams &params);  //!< Initializes only #name, #type and #blobs fields.
--- /dev/null
+#include "../perf_precomp.hpp"
+#include "opencv2/ts/ocl_perf.hpp"
+#include <opencv2/dnn/shape_utils.hpp>
+
+#ifdef HAVE_OPENCL
+
+namespace cvtest
+{
+namespace ocl
+{
+
+using std::tr1::tuple;
+using std::tr1::get;
+using std::tr1::make_tuple;
+using std::make_pair;
+using namespace perf;
+using namespace testing;
+using namespace cv;
+using namespace cv::dnn;
+
+enum {STRIDE_OFF = 1, STRIDE_ON = 2};
+CV_ENUM(StrideSize, STRIDE_OFF, STRIDE_ON);
+
+enum {GROUP_OFF = 1, GROUP_2 = 2};
+CV_ENUM(GroupSize, GROUP_OFF, GROUP_2);
+
+// Squared size
+#define SSZ(n) cv::Size(n, n)
+
+typedef std::pair<MatShape, int> InpShapeNumOut;
+typedef tuple<Size, InpShapeNumOut, GroupSize, StrideSize> ConvParam; // kernel_size, input shape, groups, stride
+typedef TestBaseWithParam<ConvParam> ConvolutionPerfTest;
+
+static inline MatShape blobShape(int count, int nplanes, int height, int width)
+{
+ int data[] = {count, nplanes, height, width};
+ return MatShape(data, data+4);
+}
+
+OCL_PERF_TEST_P( ConvolutionPerfTest, perf, Combine(
+ Values(Size(1, 1), Size(3, 3), Size(5, 5), Size(11, 11)),
+ Values(make_pair(blobShape(1, 4, 224, 224), 64),
+ make_pair(blobShape(1, 64, 112, 122), 128),
+ make_pair(blobShape(1, 256, 28, 28), 512)),
+ GroupSize::all(),
+ StrideSize::all())
+)
+{
+ RNG rng(0);
+
+ ConvParam params = GetParam();
+ int ksz = get<0>(params).width;
+ MatShape inpShape = get<1>(params).first;
+ int outCn = get<1>(params).second;
+ int groups = get<2>(params);
+ int stride = (ksz >= 11) ? 4 : (int)get<3>(params);
+
+ int inpCn = inpShape[1];
+ int wgtSize[] = { outCn, inpCn/groups, ksz, ksz };
+ int biasSize[] = { outCn, 1, 1, 1 };
+ const int wtype = CV_32F;
+ Mat wgtBlob(4, wgtSize, wtype), biasBlob(4, biasSize, wtype);
+ Mat inpBlob(4, &inpShape[0], wtype);
+ rng.fill(biasBlob, RNG::UNIFORM, -1, +1);
+ rng.fill(wgtBlob, RNG::UNIFORM, -1, +1);
+ rng.fill(inpBlob, RNG::UNIFORM, -1, +1);
+
+ LayerParams lp;
+ lp.set("num_output", outCn);
+ lp.set("group", groups);
+ lp.set("stride", stride);
+ lp.set("kernel_size", ksz);
+ lp.blobs.reserve(2);
+ lp.blobs.push_back(wgtBlob);
+ lp.blobs.push_back(biasBlob);
+
+ std::vector<Mat*> inpBlobs(1, &inpBlob);
+ std::vector<Mat> outBlobs, internalBlobs;
+
+ cv::setNumThreads(cv::getNumberOfCPUs());
+
+ Ptr<Layer> layer = cv::dnn::LayerFactory::createLayerInstance("Convolution", lp);
+ std::vector<MatShape> inputShapes(1, shape(inpBlob)), outShapes, internals;
+ layer->getMemoryShapes(inputShapes, 0, outShapes, internals);
+    for (size_t i = 0; i < outShapes.size(); i++)
+ {
+ outBlobs.push_back(Mat(outShapes[i], CV_32F));
+ }
+    for (size_t i = 0; i < internals.size(); i++)
+ {
+ internalBlobs.push_back(Mat());
+ if (total(internals[i]))
+ internalBlobs.back().create(internals[i], CV_32F);
+ }
+
+ layer->finalize(inpBlobs, outBlobs);
+ layer->preferableTarget = DNN_TARGET_OPENCL;
+
+ Mat inpBlob2D = inpBlob.reshape(1, outCn);
+ Mat wgtBlob2D = wgtBlob.reshape(1, outCn*(inpCn/groups));
+ Mat outBlob2D = outBlobs[0].reshape(1, outBlobs[0].size[0]);
+ declare.in(inpBlob2D, wgtBlob2D, WARMUP_RNG).out(outBlob2D).tbb_threads(cv::getNumThreads());
+
+ // warmup
+ layer->forward(inpBlobs, outBlobs, internalBlobs);
+
+ TEST_CYCLE()
+ {
+ layer->forward(inpBlobs, outBlobs, internalBlobs);
+ }
+
+ SANITY_CHECK_NOTHING();
+}
+
+}
+}
+
+#endif
if (backend == DNN_BACKEND_DEFAULT && target == DNN_TARGET_OPENCL)
{
-#if 0 //defined(HAVE_OPENCL)
+#if defined(HAVE_OPENCL)
if (!cv::ocl::useOpenCL())
#endif
{
if (preferableBackend == DNN_BACKEND_DEFAULT)
{
- CV_Assert(preferableTarget == DNN_TARGET_CPU);
+ CV_Assert(preferableTarget == DNN_TARGET_CPU || preferableTarget == DNN_TARGET_OPENCL);
return;
}
Ptr<Layer> layerPtr = ld.getLayerInstance();
{
layerPtr->finalize(ld.inputBlobs, ld.outputBlobs);
+ layerPtr->preferableTarget = preferableTarget;
#if 0
std::cout << "\toutputs:";
size_t noutputs = ld.outputBlobs.size();
void fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
{
- if( !fusion || preferableBackend == DNN_BACKEND_HALIDE )
+ if( !fusion || !(preferableBackend == DNN_BACKEND_DEFAULT && preferableTarget == DNN_TARGET_CPU))
return;
CV_TRACE_FUNCTION();
}
layersTimings.resize(lastLayerId + 1, 0);
-
fuseLayers(blobsToKeep_);
}
}
else
{
- CV_Assert(preferableTarget == DNN_TARGET_CPU);
+ CV_Assert(preferableTarget == DNN_TARGET_CPU || preferableTarget == DNN_TARGET_OPENCL);
}
return ld.outputBlobs[pin.oid];
}
Importer::~Importer() {}
-Layer::Layer() {}
+Layer::Layer() { preferableTarget = DNN_TARGET_CPU; }
Layer::Layer(const LayerParams &params)
: blobs(params.blobs), name(params.name), type(params.type)
{
-
+ preferableTarget = DNN_TARGET_CPU;
}
void Layer::setParamsFrom(const LayerParams &params)
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "op_halide.hpp"
+#include "opencl_kernels_dnn.hpp"
namespace cv
{
}
};
+#ifdef HAVE_OPENCL
+ bool forward_ocl(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+ {
+ CV_TRACE_FUNCTION();
+ CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+ int cAxis = clamp(axis, inputs[0]->dims);
+ if (!(cAxis == 1 && outputs[0].dims == 4 && !padding))
+ return false;
+
+ int bottom_concat_axis;
+ int concat_size = inputs[0]->size[2] * inputs[0]->size[3];
+ int top_concat_axis = outputs[0].size[1];
+ int offset_concat_axis = 0;
+ UMat inpMat, outMat;
+ outMat = outputs[0].getUMat(ACCESS_WRITE);
+
+ ocl::Kernel kernel;
+ String buildopt = String("-DDtype=") + ocl::typeToStr(inputs[0]->type()) + String(" ");
+ if (!kernel.create("concat", ocl::dnn::concat_oclsrc, buildopt))
+ return false;
+
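+        // One 1-D kernel launch per input: each copies that input's channels
+        // into the output at the running channel offset along the concat axis.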
+ for (size_t i = 0; i < inputs.size(); i++)
+ {
+ inpMat = inputs[i]->getUMat(ACCESS_READ);
+ bottom_concat_axis = inputs[i]->size[1];
+ size_t nthreads = inputs[i]->total();
+
+ kernel.set(0, (int)nthreads);
+ kernel.set(1, ocl::KernelArg::PtrReadOnly(inpMat));
+ kernel.set(2, (int)inputs[i]->size[0]);
+ kernel.set(3, (int)concat_size);
+ kernel.set(4, (int)top_concat_axis);
+ kernel.set(5, (int)bottom_concat_axis);
+ kernel.set(6, (int)offset_concat_axis);
+ kernel.set(7, ocl::KernelArg::PtrWriteOnly(outMat));
+
+ if (!kernel.run(1, &nthreads, NULL, false))
+ return false;
+
+ offset_concat_axis += bottom_concat_axis;
+ }
+
+ return true;
+ }
+#endif
+
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
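+    // Run the OpenCL branch only for OpenCL targets on Intel devices;
+    // CV_OCL_RUN falls through to the CPU path when forward_ocl returns false.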
+ CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
+ OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+ forward_ocl(inputs, outputs, internals))
+
int cAxis = clamp(axis, inputs[0]->dims);
Mat& outMat = outputs[0];
#include "opencv2/core/hal/intrin.hpp"
#include <iostream>
+#ifdef HAVE_OPENCL
+using namespace cv::dnn::ocl4dnn;
+#endif
+
namespace cv
{
namespace dnn
Ptr<BatchNormLayer> bnorm;
Ptr<ScaleLayer> scaleLayer;
+#ifdef HAVE_OPENCL
+ Ptr<OCL4DNNConvSpatial<float> > convolutionOp;
+ std::vector<UMat> umat_blobs;
+#endif
+
MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const
{
Size out(outShape[3], outShape[2]);
}
};
+#ifdef HAVE_OPENCL
+ bool forward_ocl(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+ {
+ int group = inputs[0]->size[1] / umat_blobs[0].size[1];
+
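+        // Lazily build the ocl4dnn convolution primitive on first use,
+        // configured from this layer's hyper-parameters.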
+ if (convolutionOp.empty())
+ {
+ OCL4DNNConvConfig config;
+ config.in_shape = shape(*inputs[0]);
+ config.out_shape = shape(outputs[0]);
+ config.kernel = kernel;
+ config.pad = pad;
+ config.stride = stride;
+ config.dilation = dilation;
+ config.group = group;
+            config.bias_term = hasBias();
+
+ convolutionOp = Ptr<OCL4DNNConvSpatial<float> >(new OCL4DNNConvSpatial<float>(config));
+ }
+
+ for (size_t ii = 0; ii < outputs.size(); ii++)
+ {
+ UMat inpMat, outMat;
+ inpMat = inputs[ii]->getUMat(ACCESS_READ);
+ outMat = outputs[ii].getUMat(ACCESS_WRITE);
+
+ int batch_size = inpMat.size[0];
+
+ if (!convolutionOp->Forward(inpMat, umat_blobs[0], hasBias() ? umat_blobs[1] : UMat(),
+ outMat, batch_size))
+ return false;
+ }
+ return true;
+ }
+#endif
+
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
{
CV_TRACE_FUNCTION();
int ngroups = inputs[0]->size[1]/blobs[0].size[1];
CV_Assert(outputs[0].size[1] % ngroups == 0);
+ CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
+ OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+ forward_ocl(inputs, outputs, internals))
+
int k, outCn = blobs[0].size[0];
if( weightsMat.empty() )
Ptr<BaseConvolutionLayer> ConvolutionLayer::create(const LayerParams &params)
{
- Ptr<BaseConvolutionLayer> l(new ConvolutionLayerImpl);
+ ConvolutionLayerImpl* conv_ptr = new ConvolutionLayerImpl;
+ Ptr<BaseConvolutionLayer> l(conv_ptr);
initConvDeconvLayerFromCaffe(l, params);
+
+#ifdef HAVE_OPENCL
+ size_t n = params.blobs.size();
+ conv_ptr->umat_blobs.resize(n);
+    for (size_t i = 0; i < n; i++)
+ conv_ptr->umat_blobs[i] = params.blobs[i].getUMat(ACCESS_READ);
+#endif
+
return l;
}
//M*/
#include "../precomp.hpp"
+#include "layers_common.hpp"
#include "op_halide.hpp"
#include "opencv2/imgproc.hpp"
#include <opencv2/dnn/shape_utils.hpp>
+#include "opencl_kernels_dnn.hpp"
+#include <iostream>
namespace cv
{
{
CV_TRACE_FUNCTION();
+ CV_OCL_RUN((this->preferableTarget == DNN_TARGET_OPENCL) &&
+ OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+ func.applyOCL(inputs, outputs, internals))
+
for (size_t i = 0; i < inputs.size(); i++)
{
const Mat &src = *inputs[i];
bool run_parallel;
};
+#ifdef HAVE_OPENCL
+static String oclGetTMacro(const UMat &m)
+{
+ return String("-DT=") + ocl::typeToStr(m.type()) + String(" ");
+}
+#endif
+
struct ReLUFunctor
{
typedef ReLULayer Layer;
}
}
+#ifdef HAVE_OPENCL
+ bool initKernel(ocl::Kernel &ker, const UMat &src) const
+ {
+ const char *buildoptSlope = (slope == 0) ? "-DRELU_NO_SLOPE" : "";
+ String buildopt = oclGetTMacro(src) + buildoptSlope;
+
+ if (!ker.create("ReLUForward", ocl::dnn::activations_oclsrc, buildopt))
+ return false;
+
+ if (slope != 0)
+ ker.set(3, (float)slope);
+
+ return true;
+ }
+
+ bool applyOCL(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+ {
+ size_t wgSize = ocl::Device::getDefault().maxWorkGroupSize();
+
+ for (size_t i = 0; i < inputs.size(); i++)
+ {
+ UMat src, dst;
+ inputs[i]->copyTo(src);
+ dst = outputs[i].getUMat(ACCESS_WRITE);
+ CV_Assert(src.isContinuous() && dst.isContinuous() && !src.offset && !dst.offset);
+
+ ocl::Kernel ker;
+ CV_Assert(initKernel(ker, src));
+ ker.set(0, (int)src.total());
+ ker.set(1, ocl::KernelArg::PtrReadOnly(src));
+ ker.set(2, ocl::KernelArg::PtrWriteOnly(dst));
+
+ size_t gSize = src.total();
+ CV_Assert(ker.run(1, &gSize, &wgSize, false));
+ }
+
+ return true;
+ }
+#endif
+
#ifdef HAVE_HALIDE
void attachHalide(const Halide::Expr& input, Halide::Func& top)
{
}
}
+#ifdef HAVE_OPENCL
+ bool applyOCL(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+ {
+ // TODO: implement OCL version
+ return false;
+ }
+#endif
+
#ifdef HAVE_HALIDE
void attachHalide(const Halide::Expr& input, Halide::Func& top)
{
}
}
+#ifdef HAVE_OPENCL
+ bool applyOCL(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+ {
+ // TODO: implement OCL version
+ return false;
+ }
+#endif
+
#ifdef HAVE_HALIDE
void attachHalide(const Halide::Expr& input, Halide::Func& top)
{
}
}
+#ifdef HAVE_OPENCL
+ bool applyOCL(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+ {
+ // TODO: implement OCL version
+ return false;
+ }
+#endif
+
#ifdef HAVE_HALIDE
void attachHalide(const Halide::Expr& input, Halide::Func& top)
{
}
}
+#ifdef HAVE_OPENCL
+ bool applyOCL(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+ {
+ // TODO: implement OCL version
+ return false;
+ }
+#endif
+
#ifdef HAVE_HALIDE
void attachHalide(const Halide::Expr& input, Halide::Func& top)
{
}
}
+#ifdef HAVE_OPENCL
+ bool applyOCL(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+ {
+ // TODO: implement OCL version
+ return false;
+ }
+#endif
+
#ifdef HAVE_HALIDE
void attachHalide(const Halide::Expr& input, Halide::Func& top)
{
}
}
+#ifdef HAVE_OPENCL
+ bool applyOCL(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+ {
+ // TODO: implement OCL version
+ return false;
+ }
+#endif
+
#ifdef HAVE_HALIDE
void attachHalide(const Halide::Expr& input, Halide::Func& top)
{
}
}
+#ifdef HAVE_OPENCL
+ bool applyOCL(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+ {
+ // TODO: implement OCL version
+ return false;
+ }
+#endif
+
#ifdef HAVE_HALIDE
void attachHalide(const Halide::Expr& input, Halide::Func& top)
{
v_float32x4 s4 = v_setall_f32(s), z = v_setzero_f32();
for( ; i <= len - 16; i += 16 )
{
- v_float32x4 x0 = v_load(ptr + i);
- v_float32x4 x1 = v_load(ptr + i + 4);
- v_float32x4 x2 = v_load(ptr + i + 8);
- v_float32x4 x3 = v_load(ptr + i + 12);
+ v_float32x4 x0 = v_load(srcptr + i);
+ v_float32x4 x1 = v_load(srcptr + i + 4);
+ v_float32x4 x2 = v_load(srcptr + i + 8);
+ v_float32x4 x3 = v_load(srcptr + i + 12);
x0 = v_select(x0 >= z, x0, x0*s4);
x1 = v_select(x1 >= z, x1, x1*s4);
x2 = v_select(x2 >= z, x2, x2*s4);
x3 = v_select(x3 >= z, x3, x3*s4);
- v_store(ptr + i, x0);
- v_store(ptr + i + 4, x1);
- v_store(ptr + i + 8, x2);
- v_store(ptr + i + 12, x3);
+ v_store(dstptr + i, x0);
+ v_store(dstptr + i + 4, x1);
+ v_store(dstptr + i + 8, x2);
+ v_store(dstptr + i + 12, x3);
}
#endif
for( ; i < len; i++ )
}
}
+#ifdef HAVE_OPENCL
+ bool applyOCL(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+ {
+ // TODO: implement OCL version
+ return false;
+ }
+#endif
+
#ifdef HAVE_HALIDE
void attachHalide(const Halide::Expr& input, Halide::Func& top)
{
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "op_halide.hpp"
+#include "opencl_kernels_dnn.hpp"
#include <opencv2/dnn/shape_utils.hpp>
+#ifdef HAVE_OPENCL
+using namespace cv::dnn::ocl4dnn;
+#endif
+
namespace cv
{
namespace dnn
public:
enum { VEC_ALIGN = 8 };
+#ifdef HAVE_OPENCL
+ Ptr<OCL4DNNInnerProduct<float> > innerProductOp;
+ std::vector<UMat> umat_blobs;
+#endif
+
FullyConnectedLayerImpl(const LayerParams& params)
{
setParamsFrom(params);
biasMat = blobs[1] = blobs[1].reshape(1, 1);
else
biasMat = Mat::zeros(1, numOutput, weightsMat.type());
+
+#ifdef HAVE_OPENCL
+ size_t n = blobs.size();
+ umat_blobs.resize(n);
+    for (size_t i = 0; i < n; i++) umat_blobs[i] = blobs[i].getUMat(ACCESS_READ);
+#endif
}
bool getMemoryShapes(const std::vector<MatShape> &inputs,
bool useAVX2;
};
+#ifdef HAVE_OPENCL
+ bool forward_ocl(std::vector<Mat*> &input, std::vector<Mat> &output)
+ {
+ int axisCan = clamp(axis, input[0]->dims);
+ int numOutput = blobs[0].size[0];
+ int innerSize = blobs[0].size[1];
+ int outerSize = input[0]->total(0, axisCan);
+ bool ret = true;
+
+ if (innerProductOp.empty())
+ {
+ OCL4DNNInnerProductConfig config;
+ config.num_output = numOutput;
+ config.bias_term = bias;
+ config.M = outerSize;
+ config.K = innerSize;
+
+ innerProductOp = Ptr<OCL4DNNInnerProduct<float> >(new OCL4DNNInnerProduct<float>(config));
+ }
+
+ UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
+ for (size_t i = 0; i < input.size(); i++)
+ {
+ UMat srcMat, dstMat;
+ srcMat = input[i]->getUMat(ACCESS_READ);
+ dstMat = output[i].getUMat(ACCESS_WRITE);
+ dstMat.setTo(0.0f);
+
+ if (!innerProductOp->Forward(srcMat, umat_blobs[0], (bias) ? umat_blobs[1] : UMat(), dstMat))
+ {
+ ret = false;
+ break;
+ }
+
+ if (bias && (outerSize > 1))
+ {
+ UMat& biases = umat_blobs[1];
+ cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
+ }
+ }
+
+ if (ret) return true;
+
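+        // ocl4dnn path failed: fall back to plain GEMM on the same UMats,
+        // dst = src * W^T, plus bias via a second gemm with the ones vector.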
+ UMat& weights = umat_blobs[0];
+ for (size_t i = 0; i < input.size(); i++)
+ {
+ UMat srcMat, dstMat;
+ srcMat = input[i]->reshape(1, outerSize).getUMat(ACCESS_READ);
+ dstMat = output[i].reshape(1, outerSize).getUMat(ACCESS_WRITE);
+
+ cv::gemm(srcMat, weights, 1, noArray(), 0, dstMat, GEMM_2_T);
+
+ if (bias)
+ {
+ UMat& biases = umat_blobs[1];
+ cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
+ }
+ }
+
+ return true;
+ }
+#endif
+
void forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &)
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+ CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
+ OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+ forward_ocl(input, output))
+
int axisCan = clamp(axis, input[0]->dims);
int outerSize = input[0]->total(0, axisCan);
#include "layers/layers_common.simd_declarations.hpp"
#undef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
+#ifdef HAVE_OPENCL
+#include "ocl4dnn.hpp"
+#endif
+
namespace cv
{
namespace dnn
#include "opencv2/imgproc.hpp"
#include "opencv2/dnn/shape_utils.hpp"
#include "opencv2/core/hal/hal.hpp"
+#include "opencl_kernels_dnn.hpp"
#include <algorithm>
+#ifdef HAVE_OPENCL
+using namespace cv::dnn::ocl4dnn;
+#endif
+
namespace cv
{
namespace dnn
normBySize = params.get<bool>("norm_by_size", true);
}
+#ifdef HAVE_OPENCL
+ Ptr<OCL4DNNLRN<float> > lrnOp;
+#endif
+
virtual bool supportBackend(int backendId)
{
return backendId == DNN_BACKEND_DEFAULT ||
backendId == DNN_BACKEND_HALIDE && haveHalide();
}
+#ifdef HAVE_OPENCL
+ bool forward_ocl(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+ {
+ if (lrnOp.empty())
+ {
+ OCL4DNNLRNConfig config;
+ config.lrn_type = type == CHANNEL_NRM ?
+ LRNParameter_NormRegion_ACROSS_CHANNELS :
+ LRNParameter_NormRegion_WITHIN_CHANNEL;
+
+            CHECK_EQ(size % 2, 1) << "LRN only supports odd values for local_size";
+ config.local_size = size;
+ config.alpha = alpha;
+ config.beta = beta;
+ config.k = bias;
+ CHECK_EQ(4, inputs[0]->dims) << "Input must have 4 axes, "
+ << "corresponding to (num, channels, height, width)";
+ config.batch_size = inputs[0]->size[0];
+ config.channels = inputs[0]->size[1];
+ config.height = inputs[0]->size[2];
+ config.width = inputs[0]->size[3];
+ config.norm_by_size = normBySize;
+
+ lrnOp = Ptr<OCL4DNNLRN<float> >(new OCL4DNNLRN<float>(config));
+ }
+
+ UMat inpMat, outMat;
+ inpMat = inputs[0]->getUMat(ACCESS_READ);
+ outMat = outputs[0].getUMat(ACCESS_WRITE);
+
+ if (!lrnOp->Forward(inpMat, outMat))
+ return false;
+
+ return true;
+ }
+#endif
+
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
CV_Assert(inputs.size() == outputs.size());
+
+ CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
+ OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+ forward_ocl(inputs, outputs, internals))
+
for (int i = 0; i < inputs.size(); i++)
{
CV_Assert(inputs[i]->dims == 4);
#include "layers_common.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "op_halide.hpp"
+#include "opencl_kernels_dnn.hpp"
#include <float.h>
#include <algorithm>
using std::max;
using std::min;
+#ifdef HAVE_OPENCL
+using namespace cv::dnn::ocl4dnn;
+#endif
namespace cv
{
ceilMode = params.get<bool>("ceil_mode", true);
}
+#ifdef HAVE_OPENCL
+ Ptr<OCL4DNNPool<float> > poolOp;
+#endif
+
void finalize(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
{
CV_Assert(inputs.size() == 1);
type == PoolingLayer::AVE && !pad.width && !pad.height);
}
+#ifdef HAVE_OPENCL
+ bool forward_ocl(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+ {
+ if (poolOp.empty())
+ {
+ OCL4DNNPoolConfig config;
+
+ config.in_shape = shape(*inputs[0]);
+ config.out_shape = shape(outputs[0]);
+ config.kernel = kernel;
+ config.pad = pad;
+ config.stride = stride;
+ config.channels = inputs[0]->size[1];
+ config.pool_method = type == MAX ? LIBDNN_POOLING_METHOD_MAX :
+ (type == AVE ? LIBDNN_POOLING_METHOD_AVE :
+ LIBDNN_POOLING_METHOD_STO);
+ poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config));
+ }
+
+ for (size_t ii = 0; ii < inputs.size(); ii++)
+ {
+ UMat inpMat, outMat, maskMat;
+
+ inpMat = inputs[ii]->getUMat(ACCESS_READ);
+
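+            // MAX pooling produces two outputs per input: the pooled values
+            // and an argmax index mask.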
+ if (type == MAX)
+ {
+ outMat = outputs[2 * ii].getUMat(ACCESS_WRITE);
+ maskMat = outputs[2 * ii + 1].getUMat(ACCESS_WRITE);
+ } else {
+ outMat = outputs[ii].getUMat(ACCESS_WRITE);
+ maskMat = UMat();
+ }
+
+ CV_Assert(inpMat.offset == 0 && outMat.offset == 0);
+
+ if (!poolOp->Forward(inpMat, outMat, maskMat))
+ return false;
+ }
+
+ return true;
+ }
+#endif
+
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+ CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
+ OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+ forward_ocl(inputs, outputs, internals))
+
for (size_t ii = 0; ii < inputs.size(); ii++)
{
switch (type)
#include "../precomp.hpp"
#include "layers_common.hpp"
#include "op_halide.hpp"
+#include "opencl_kernels_dnn.hpp"
#include <algorithm>
#include <stdlib.h>
using std::max;
+#ifdef HAVE_OPENCL
+using namespace cv::dnn::ocl4dnn;
+#endif
namespace cv
{
setParamsFrom(params);
}
+#ifdef HAVE_OPENCL
+ Ptr<OCL4DNNSoftmax<float> > softmaxOp;
+#endif
+
bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
backendId == DNN_BACKEND_HALIDE && haveHalide() && axisRaw == 1;
}
+#ifdef HAVE_OPENCL
+ bool forward_ocl(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
+ {
+ if (softmaxOp.empty())
+ {
+ OCL4DNNSoftmaxConfig config;
+
+ config.in_shape = shape(*inputs[0]);
+ config.axis = axisRaw;
+ config.channels = inputs[0]->size[axisRaw];
+
+ softmaxOp = Ptr<OCL4DNNSoftmax<float> >(new OCL4DNNSoftmax<float>(config));
+ }
+
+ UMat srcMat, dstMat;
+ srcMat = inputs[0]->getUMat(ACCESS_READ);
+ dstMat = outputs[0].getUMat(ACCESS_WRITE);
+
+ if (!logSoftMax && softmaxOp->Forward(srcMat, dstMat))
+ return true;
+
+ const Mat &src = *inputs[0];
+ UMat bufMat = internals[0].getUMat(ACCESS_WRITE);
+ srcMat.copyTo(dstMat);
+
+ int axis = clamp(axisRaw, src.dims);
+ size_t outerSize = src.total(0, axis);
+ size_t channels = src.size[axis];
+ size_t innerSize = src.total(axis + 1);
+
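+        // Numerically stable (log-)softmax as a four-kernel pipeline:
+        // channel max -> subtract -> exp (via cv::exp) -> channel sum -> divide.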
+ String buildOpts = String("-DT=") + ocl::typeToStr(src.type());
+ ocl::Kernel kmax, ksub, ksum, kdiv;
+
+ if (!kmax.create("kernel_channel_max", ocl::dnn::softmax_oclsrc, buildOpts))
+ return false;
+
+ if (!ksub.create("kernel_channel_subtract", ocl::dnn::softmax_oclsrc, buildOpts))
+ return false;
+
+ if (!ksum.create("kernel_channel_sum", ocl::dnn::softmax_oclsrc, buildOpts))
+ return false;
+
+ if (logSoftMax) buildOpts += " -DLOG_SOFTMAX ";
+ if (!kdiv.create("kernel_channel_div", ocl::dnn::softmax_oclsrc, buildOpts))
+ return false;
+
+ size_t wgSize = ocl::Device::getDefault().maxWorkGroupSize();
+ size_t bufSize = internals[0].total();
+ size_t totalSize = src.total();
+
+ kmax.args((int)outerSize, (int)channels, (int)innerSize,
+ ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat));
+ if (!kmax.run(1, &bufSize, &wgSize, false))
+ return false;
+
+ ksub.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize,
+ ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat));
+ if (!ksub.run(1, &totalSize, &wgSize, false))
+ return false;
+
+ cv::exp(dstMat, dstMat);
+
+ ksum.args((int)outerSize, (int)channels, (int)innerSize,
+ ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat));
+ if (!ksum.run(1, &bufSize, &wgSize, false))
+ return false;
+
+ kdiv.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize,
+ ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat));
+ if (!kdiv.run(1, &totalSize, &wgSize, false))
+ return false;
+
+ return true;
+ }
+#endif
+
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
{
CV_TRACE_FUNCTION();
CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+ CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
+ OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+ forward_ocl(inputs, outputs, internals))
+
const Mat &src = *inputs[0];
Mat &dst = outputs[0];
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef _OPENCV_LIBDNN_COMMON_HPP_
+#define _OPENCV_LIBDNN_COMMON_HPP_
+#include "../../precomp.hpp"
+#include "../../caffe/glog_emulator.hpp"
+#include <opencv2/core/opencl/runtime/opencl_core.hpp>
+
+#ifdef HAVE_OPENCL
+
+// Macro to select the kernel precision suffix; only the single-precision
+// (_float) variant is currently selected.
+#define CL_KERNEL_SELECT(kernel) kernel "_float"
+
+#define OCL_CHECK(condition) \
+ do { \
+ cl_int error = (condition); \
+ CHECK_EQ(error, CL_SUCCESS) << " " << cv::ocl::getOpenCLErrorString(error); \
+ } while (0)
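+// Usage sketch: OCL_CHECK(clFinish(queue)); fails loudly (via CHECK_EQ)
+// with the symbolic error name when the call does not return CL_SUCCESS.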
+
+bool clOptionSupport(cv::String option);
+
+#endif // HAVE_OPENCL
+#endif
--- /dev/null
+#ifndef _OPENCV_OCL4DNN_DEFAULT_KERNEL_CONFIG_HPP_
+#define _OPENCV_OCL4DNN_DEFAULT_KERNEL_CONFIG_HPP_
+const char *default_kernel_config_intel[] = {
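+// Each key encodes a convolution shape (k=kernel, cn=input channels,
+// g=groups, s=stride, d=dilation, b=bias, in=input size, p=pad,
+// num=batch size, M=output channels); the value is the tuned kernel
+// configuration string for that shape.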
+// Below is the OpenCL platform/device information based on which these configurations were tuned
+/*******************************************************************************
+Number of platforms 1
+ Platform Name Intel(R) OpenCL
+ Platform Vendor Intel(R) Corporation
+ Platform Version OpenCL 2.0
+ Platform Profile FULL_PROFILE
+ Platform Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_driver_diagnostics cl_intel_media_block_io cl_intel_motion_estimation cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups
+ Platform Extensions function suffix INTEL
+
+ Platform Name Intel(R) OpenCL
+Number of devices 1
+ Device Name Intel(R) HD Graphics
+ Device Vendor Intel(R) Corporation
+ Device Vendor ID 0x8086
+ Device Version OpenCL 2.0
+ Driver Version r4.1.61547
+ Device OpenCL C Version OpenCL C 2.0
+ Device Type GPU
+ Device Profile FULL_PROFILE
+ Max compute units 72
+ Max clock frequency 950MHz
+ Device Partition (core)
+ Max number of sub-devices 0
+ Supported partition types by <unknown> (0x7FE000000000)
+ Max work item dimensions 3
+ Max work item sizes 256x256x256
+ Max work group size 256
+ Preferred work group size multiple 32
+ Preferred / native vector sizes
+ char 16 / 16
+ short 8 / 8
+ int 4 / 4
+ long 1 / 1
+ half 8 / 8 (cl_khr_fp16)
+ float 1 / 1
+ double 1 / 1 (cl_khr_fp64)
+ Half-precision Floating-point support (cl_khr_fp16)
+ Denormals Yes
+ Infinity and NANs Yes
+ Round to nearest Yes
+ Round to zero Yes
+ Round to infinity Yes
+ IEEE754-2008 fused multiply-add Yes
+ Support is emulated in software No
+ Correctly-rounded divide and sqrt operations No
+ Single-precision Floating-point support (core)
+ Denormals Yes
+ Infinity and NANs Yes
+ Round to nearest Yes
+ Round to zero Yes
+ Round to infinity Yes
+ IEEE754-2008 fused multiply-add Yes
+ Support is emulated in software No
+ Correctly-rounded divide and sqrt operations Yes
+ Double-precision Floating-point support (cl_khr_fp64)
+ Denormals Yes
+ Infinity and NANs Yes
+ Round to nearest Yes
+ Round to zero Yes
+ Round to infinity Yes
+ IEEE754-2008 fused multiply-add Yes
+ Support is emulated in software No
+ Correctly-rounded divide and sqrt operations No
+ Address bits 64, Little-Endian
+ Global memory size 26887677543 (25.04GiB)
+ Error Correction support No
+ Max memory allocation 4294959103 (4GiB)
+ Unified memory for Host and Device Yes
+ Shared Virtual Memory (SVM) capabilities (core)
+ Coarse-grained buffer sharing Yes
+ Fine-grained buffer sharing No
+ Fine-grained system sharing No
+ Atomics No
+ Minimum alignment for any data type 128 bytes
+ Alignment of base address 1024 bits (128 bytes)
+ Preferred alignment for atomics
+ SVM 64 bytes
+ Global 64 bytes
+ Local 64 bytes
+ Max size for global variable 65536 (64KiB)
+ Preferred total size of global vars 4294959103 (4GiB)
+ Global Memory cache type Read/Write
+ Global Memory cache size 1572864
+ Global Memory cache line 64 bytes
+ Image support Yes
+ Max number of samplers per kernel 16
+ Max size for 1D images from buffer 268434943 pixels
+ Max 1D or 2D image array size 2048 images
+ Base address alignment for 2D image buffers 4 bytes
+ Pitch alignment for 2D image buffers 4 bytes
+ Max 2D image size 16384x16384 pixels
+ Max 3D image size 16384x16384x2048 pixels
+ Max number of read image args 128
+ Max number of write image args 128
+ Max number of read/write image args 128
+ Max number of pipe args 16
+ Max active pipe reservations 1
+ Max pipe packet size 1024
+ Local memory type Local
+ Local memory size 65536 (64KiB)
+ Max constant buffer size 4294959103 (4GiB)
+ Max number of constant args 8
+ Max size of kernel argument 1024
+ Queue properties (on host)
+ Out-of-order execution Yes
+ Profiling Yes
+ Queue properties (on device)
+ Out-of-order execution Yes
+ Profiling Yes
+ Preferred size 131072 (128KiB)
+ Max size 67108864 (64MiB)
+ Max queues on device 1
+ Max events on device 1024
+ Prefer user sync for interop Yes
+ Profiling timer resolution 83ns
+ Execution capabilities
+ Run OpenCL kernels Yes
+ Run native kernels No
+ SPIR versions 1.2
+ printf() buffer size 4194304 (4MiB)
+ Built-in kernels block_motion_estimate_intel;block_advanced_motion_estimate_check_intel;block_advanced_motion_estimate_bidirectional_check_intel
+ Motion Estimation accelerator version (Intel) 2
+ Device Available Yes
+ Compiler Available Yes
+ Linker Available Yes
+ Device Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_driver_diagnostics cl_intel_media_block_io cl_intel_motion_estimation cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups
+
+NULL platform behavior
+ clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...) No platform
+ clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, ...) No platform
+ clCreateContext(NULL, ...) [default] No platform
+ clCreateContext(NULL, ...) [other] Success [INTEL]
+ clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU) No platform
+ clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU) No platform
+ clCreateContextFromType(NULL, CL_DEVICE_TYPE_ACCELERATOR) No platform
+ clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM) No platform
+ clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL) No platform
+********************************************************************************/
+"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","4 6 8 2 1 1 8 1 0 ",
+"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32","12 2 8 2 1 1 8 1 0 ",
+"EU72_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64","1 8 32 5 1 8 1 1 0 ",
+"EU72_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128","4 2 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","1 8 32 5 1 8 1 1 0 ",
+"EU72_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num2_M192","2 7 16 2 1 1 16 1 0 ",
+"EU72_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M48","4 3 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","4 6 8 2 1 1 8 1 0 ",
+"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M96","1 8 32 5 1 8 1 1 0 ",
+"EU72_k11x7_cn3_g1_s3x4_d1x1_b1_in64x64_p3x2_num1_M64","4 1 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64","8 3 8 2 1 1 8 1 0 ",
+"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","4 6 8 2 1 1 8 1 0 ",
+"EU72_k3x3_cn4_g1_s1x1_d1x1_b1_in256x256_p1x1_num1_M4","14 1 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M4","4 4 8 2 1 1 8 1 0 ",
+"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M128","4 2 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192","1 8 32 5 1 8 1 1 0 ",
+"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M192","1 8 32 5 1 8 1 1 0 ",
+"EU72_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M208","2 6 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","8 3 8 2 1 1 8 1 0 ",
+"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M384","1 8 32 5 1 8 1 1 0 ",
+"EU72_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M320","2 5 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","8 3 8 2 1 1 8 1 0 ",
+"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M256","1 8 32 5 1 8 1 1 0 ",
+"EU72_k5x1_cn32_g1_s1x1_d1x1_b0_in64x64_p2x0_num1_M32","4 6 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn16_g1_s1x1_d1x1_b0_in256x256_p0x0_num1_M4","12 2 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64","2 8 32 5 1 8 1 1 0 ",
+"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M16","8 3 8 2 1 1 8 1 0 ",
+"EU72_k1x1_cn32_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M128","1 16 32 5 1 16 1 1 0 ",
+"EU72_k3x3_cn32_g1_s1x1_d2x2_b1_in64x64_p2x2_num1_M32","3 6 16 2 1 1 16 1 0 ",
+"EU72_k3x3_cn32_g1_s1x1_d16x16_b1_in64x64_p16x16_num1_M32","1 16 32 5 1 16 1 1 0 ",
+"EU72_k1x1_cn128_g1_s1x1_d1x1_b0_in32x32_p0x0_num1_M512","2 8 32 5 1 8 1 1 0 ",
+"EU72_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M384","2 7 16 2 1 1 16 1 0 ",
+"EU72_k5x4_cn6_g3_s3x2_d1x1_b1_in128x80_p1x0_num2_M4","1 1 1 4 1 1 1 0 1 ",
+"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num2_M96","4 5 16 2 1 1 16 1 0 ",
+"EU72_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M192","10 2 16 2 1 1 16 1 0 ",
+"EU72_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M192","6 4 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn4_g1_s1x1_d1x1_b0_in256x256_p0x0_num1_M16","2 8 32 5 1 8 1 1 0 ",
+"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M96","8 3 8 2 1 1 8 1 0 ",
+"EU72_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M32","8 1 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M64","1 8 32 5 1 8 1 1 0 ",
+"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M384","4 7 8 2 1 1 8 1 0 ",
+"EU72_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256","2 6 16 2 1 1 16 1 0 ",
+"EU72_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128","6 4 16 2 1 1 16 1 0 ",
+"EU72_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M64","4 4 16 2 1 1 16 1 0 ",
+"EU72_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M48","4 3 16 2 1 1 16 1 0 ",
+"EU72_k3x3_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M5","2 3 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M24","8 2 8 2 1 1 8 1 0 ",
+"EU72_k3x3_cn128_g1_s1x1_d1x1_b0_in32x32_p1x1_num1_M128","1 8 32 5 1 8 1 1 0 ",
+"EU72_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num2_M128","2 7 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M32","1 16 32 5 1 16 1 1 0 ",
+"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M112","8 2 8 2 1 1 8 1 0 ",
+"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M64","1 8 32 5 1 8 1 1 0 ",
+"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","1 8 32 5 1 8 1 1 0 ",
+"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128","4 3 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num2_M64","1 16 32 5 1 16 1 1 0 ",
+"EU72_k1x1_cn64_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M16","2 8 32 5 1 8 1 1 0 ",
+"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M144","1 8 32 5 1 8 1 1 0 ",
+"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","8 2 8 2 1 1 8 1 0 ",
+"EU72_k1x1_cn16_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M64","1 16 32 5 1 16 1 1 0 ",
+"EU72_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M224","2 7 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256","4 6 8 2 1 1 8 1 0 ",
+"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M96","4 3 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn512_g1_s2x2_d1x1_b0_in32x32_p0x0_num1_M256","1 8 32 5 1 8 1 1 0 ",
+"EU72_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num2_M192","10 2 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64","12 2 8 2 1 1 8 1 0 ",
+"EU72_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M128","2 5 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M48","4 6 8 2 1 1 8 1 0 ",
+"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M48","1 8 32 5 1 8 1 1 0 ",
+"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256","8 3 8 2 1 1 8 1 0 ",
+"EU72_k1x1_cn256_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","2 8 32 5 1 8 1 1 0 ",
+"EU72_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M288","2 5 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ",
+"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","1 8 32 5 1 8 1 1 0 ",
+"EU72_k1x1_cn1024_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M256","1 8 32 5 1 8 1 1 0 ",
+"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M96","1 8 32 5 1 8 1 1 0 ",
+"EU72_k1x1_cn512_g1_s2x2_d1x1_b0_in32x32_p0x0_num1_M1024","1 16 32 5 1 16 1 1 0 ",
+"EU72_k1x1_cn2048_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M512","4 6 8 2 1 1 8 1 0 ",
+"EU72_k3x3_cn512_g1_s1x1_d1x1_b0_in16x16_p1x1_num1_M512","2 5 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ",
+"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M16","8 2 8 2 1 1 8 1 0 ",
+"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64","4 2 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","8 3 8 2 1 1 8 1 0 ",
+"EU72_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M288","2 7 16 2 1 1 16 1 0 ",
+"EU72_k3x3_cn16_g1_s1x1_d1x1_b1_in128x128_p1x1_num1_M16","2 5 16 2 1 1 16 1 0 ",
+"EU72_k3x3_cn32_g1_s1x1_d8x8_b1_in64x64_p8x8_num1_M32","1 8 32 5 1 8 1 1 0 ",
+"EU72_k1x1_cn64_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M4","8 3 8 2 1 1 8 1 0 ",
+"EU72_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M256","2 7 16 2 1 1 16 1 0 ",
+"EU72_k3x3_cn256_g1_s1x1_d1x1_b0_in16x16_p1x1_num1_M256","2 5 16 2 1 1 16 1 0 ",
+"EU72_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M224","2 5 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ",
+"EU72_k2x2_cn16_g1_s2x2_d1x1_b0_in256x256_p0x0_num1_M16","6 4 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192","4 6 8 2 1 1 8 1 0 ",
+"EU72_k1x1_cn1024_g1_s2x2_d1x1_b0_in16x16_p0x0_num1_M512","1 8 32 5 1 8 1 1 0 ",
+"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","1 8 32 5 1 8 1 1 0 ",
+"EU72_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M192","2 5 16 2 1 1 16 1 0 ",
+"EU72_k5x5_cn96_g2_s1x1_d1x1_b1_in32x32_p2x2_num1_M128","4 3 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","8 2 8 2 1 1 8 1 0 ",
+"EU72_k2x2_cn64_g1_s2x2_d1x1_b0_in128x128_p0x0_num1_M32","8 3 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn64_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M256","1 16 32 5 1 16 1 1 0 ",
+"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32","12 2 8 2 1 1 8 1 0 ",
+"EU72_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num2_M32","4 2 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M16","12 1 8 2 1 1 8 1 0 ",
+"EU72_k11x11_cn3_g1_s4x4_d1x1_b1_in224x224_p0x0_num1_M96","1 8 32 5 1 8 1 1 0 ",
+"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M64","1 8 32 5 1 8 1 1 0 ",
+"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M256","4 7 8 2 1 1 8 1 0 ",
+"EU72_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384","2 5 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M16","12 1 8 2 1 1 8 1 0 ",
+"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","8 3 8 2 1 1 8 1 0 ",
+"EU72_k1x1_cn256_g1_s2x2_d1x1_b0_in64x64_p0x0_num1_M512","1 16 32 5 1 16 1 1 0 ",
+"EU72_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M16","2 8 32 5 1 8 1 1 0 ",
+"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M192","4 6 8 2 1 1 8 1 0 ",
+"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M96","1 16 32 5 1 16 1 1 0 ",
+"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","12 1 8 2 1 1 8 1 0 ",
+"EU72_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M64","12 2 8 2 1 1 8 1 0 ",
+"EU72_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384","2 7 16 2 1 1 16 1 0 ",
+"EU72_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64","4 2 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24","12 1 8 2 1 1 8 1 0 ",
+"EU72_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M128","4 2 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","4 6 8 2 1 1 8 1 0 ",
+"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M144","1 8 32 5 1 8 1 1 0 ",
+"EU72_k3x3_cn32_g1_s1x1_d4x4_b1_in64x64_p4x4_num1_M32","1 8 32 5 1 8 1 1 0 ",
+"EU72_k3x3_cn3_g1_s2x2_d1x1_b1_in256x256_p1x1_num1_M13","1 1 1 4 1 1 1 0 1 ",
+"EU72_k3x3_cn32_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M32","6 4 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M32","1 8 32 5 1 8 1 1 0 ",
+"EU72_k3x3_cn64_g1_s1x1_d1x1_b0_in64x64_p1x1_num1_M64","2 7 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn256_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M1024","2 8 32 5 1 8 1 1 0 ",
+"EU72_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M320","1 8 32 5 1 8 1 1 0 ",
+"EU72_k1x5_cn32_g1_s1x1_d1x1_b1_in64x64_p0x2_num1_M32","4 6 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn64_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","1 16 32 5 1 16 1 1 0 ",
+"EU72_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","4 6 8 2 1 1 8 1 0 ",
+"EU72_k1x1_cn512_g1_s1x1_d1x1_b0_in32x32_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ",
+"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64","8 3 8 2 1 1 8 1 0 ",
+"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64","12 2 8 2 1 1 8 1 0 ",
+"EU72_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M128","2 8 32 5 1 8 1 1 0 ",
+"EU72_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","8 3 8 2 1 1 8 1 0 ",
+"EU72_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M32","1 8 32 5 1 8 1 1 0 ",
+"EU72_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M112","1 8 32 5 1 8 1 1 0 ",
+"EU72_k4x4_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M2","1 3 16 2 1 1 16 1 0 ",
+"EU72_k1x1_cn1024_g1_s2x2_d1x1_b0_in16x16_p0x0_num1_M2048","1 8 32 5 1 8 1 1 0 ",
+"EU72_k1x1_cn256_g1_s2x2_d1x1_b0_in64x64_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ",
+"EU72_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num2_M64","1 8 32 5 1 8 1 1 0 ",
+"EU72_k1x1_cn512_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M2048","1 8 32 5 1 8 1 1 0 ",
+"EU72_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M64","8 1 16 2 1 1 16 1 0 ",
+"EU72_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M208","2 7 16 2 1 1 16 1 0 ",
+// Below is the OpenCL platform/device information for which these configurations were tuned
+/*******************************************************************************
+Number of platforms 1
+ Platform Name Intel(R) OpenCL
+ Platform Vendor Intel(R) Corporation
+ Platform Version OpenCL 2.0
+ Platform Profile FULL_PROFILE
+ Platform Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_driver_diagnostics cl_intel_motion_estimation cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups
+ Platform Extensions function suffix INTEL
+
+ Platform Name Intel(R) OpenCL
+Number of devices 1
+ Device Name Intel(R) HD Graphics
+ Device Vendor Intel(R) Corporation
+ Device Vendor ID 0x8086
+ Device Version OpenCL 2.0
+ Driver Version 16.5.56875
+ Device OpenCL C Version OpenCL C 2.0 ( using IGC )
+ Device Type GPU
+ Device Profile FULL_PROFILE
+ Max compute units 48
+ Max clock frequency 950MHz
+ Device Partition (core)
+ Max number of sub-devices 0
+ Supported partition types by <unknown> (0x7F4B00000000)
+ Max work item dimensions 3
+ Max work item sizes 256x256x256
+ Max work group size 256
+ Preferred work group size multiple 32
+ Preferred / native vector sizes
+ char 16 / 16
+ short 8 / 8
+ int 4 / 4
+ long 1 / 1
+ half 8 / 8 (cl_khr_fp16)
+ float 1 / 1
+ double 1 / 1 (cl_khr_fp64)
+ Half-precision Floating-point support (cl_khr_fp16)
+ Denormals Yes
+ Infinity and NANs Yes
+ Round to nearest Yes
+ Round to zero Yes
+ Round to infinity Yes
+ IEEE754-2008 fused multiply-add Yes
+ Support is emulated in software No
+ Correctly-rounded divide and sqrt operations No
+ Single-precision Floating-point support (core)
+ Denormals Yes
+ Infinity and NANs Yes
+ Round to nearest Yes
+ Round to zero Yes
+ Round to infinity Yes
+ IEEE754-2008 fused multiply-add Yes
+ Support is emulated in software No
+ Correctly-rounded divide and sqrt operations Yes
+ Double-precision Floating-point support (cl_khr_fp64)
+ Denormals Yes
+ Infinity and NANs Yes
+ Round to nearest Yes
+ Round to zero Yes
+ Round to infinity Yes
+ IEEE754-2008 fused multiply-add Yes
+ Support is emulated in software No
+ Correctly-rounded divide and sqrt operations No
+ Address bits 64, Little-Endian
+ Global memory size 13361912218 (12.44GiB)
+ Error Correction support No
+ Max memory allocation 4294959103 (4GiB)
+ Unified memory for Host and Device Yes
+ Shared Virtual Memory (SVM) capabilities (core)
+ Coarse-grained buffer sharing Yes
+ Fine-grained buffer sharing No
+ Fine-grained system sharing No
+ Atomics No
+ Minimum alignment for any data type 128 bytes
+ Alignment of base address 1024 bits (128 bytes)
+ Preferred alignment for atomics
+ SVM 64 bytes
+ Global 64 bytes
+ Local 64 bytes
+ Max size for global variable 65536 (64KiB)
+ Preferred total size of global vars 4294959103 (4GiB)
+ Global Memory cache type Read/Write
+ Global Memory cache size 1048576
+ Global Memory cache line 64 bytes
+ Image support Yes
+ Max number of samplers per kernel 16
+ Max size for 1D images from buffer 268434943 pixels
+ Max 1D or 2D image array size 2048 images
+ Base address alignment for 2D image buffers 4 bytes
+ Pitch alignment for 2D image buffers 4 bytes
+ Max 2D image size 16384x16384 pixels
+ Max 3D image size 16384x16384x2048 pixels
+ Max number of read image args 128
+ Max number of write image args 128
+ Max number of read/write image args 128
+ Max number of pipe args 16
+ Max active pipe reservations 1
+ Max pipe packet size 1024
+ Local memory type Local
+ Local memory size 65536 (64KiB)
+ Max constant buffer size 4294959103 (4GiB)
+ Max number of constant args 8
+ Max size of kernel argument 1024
+ Queue properties (on host)
+ Out-of-order execution Yes
+ Profiling Yes
+ Queue properties (on device)
+ Out-of-order execution Yes
+ Profiling Yes
+ Preferred size 131072 (128KiB)
+ Max size 67108864 (64MiB)
+ Max queues on device 1
+ Max events on device 1024
+ Prefer user sync for interop Yes
+ Profiling timer resolution 83ns
+ Execution capabilities
+ Run OpenCL kernels Yes
+ Run native kernels No
+ SPIR versions 1.2
+ printf() buffer size 4194304 (4MiB)
+ Built-in kernels block_motion_estimate_intel;block_advanced_motion_estimate_check_intel;block_advanced_motion_estimate_bidirectional_check_intel
+ Motion Estimation accelerator version (Intel) 2
+ Device Available Yes
+ Compiler Available Yes
+ Linker Available Yes
+ Device Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_driver_diagnostics cl_intel_motion_estimation cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups
+
+NULL platform behavior
+ clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...) No platform
+ clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, ...) No platform
+ clCreateContext(NULL, ...) [default] No platform
+ clCreateContext(NULL, ...) [other] Success [INTEL]
+ clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU) No platform
+ clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU) No platform
+ clCreateContextFromType(NULL, CL_DEVICE_TYPE_ACCELERATOR) No platform
+ clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM) No platform
+ clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL) No platform
+********************************************************************************/
+"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","8 3 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64","8 2 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn32_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M128","1 16 32 5 1 16 1 1 0 ",
+"EU48_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M32","8 1 16 2 1 1 16 1 0 ",
+"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M144","1 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M96","1 16 32 5 1 16 1 1 0 ",
+"EU48_k3x3_cn128_g1_s1x1_d1x1_b0_in32x32_p1x1_num1_M128","6 4 16 2 1 1 16 1 0 ",
+"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128","2 8 32 5 1 8 1 1 0 ",
+"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64","8 1 16 2 1 1 16 1 0 ",
+"EU48_k2x2_cn16_g1_s2x2_d1x1_b0_in256x256_p0x0_num1_M16","2 7 16 2 1 1 16 1 0 ",
+"EU48_k3x3_cn4_g1_s1x1_d1x1_b1_in256x256_p1x1_num1_M4","6 4 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn128_g1_s1x1_d1x1_b0_in32x32_p0x0_num1_M512","2 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M112","8 3 8 2 1 1 8 1 0 ",
+"EU48_k3x3_cn512_g1_s1x1_d1x1_b0_in16x16_p1x1_num1_M512","2 7 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M64","2 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M384","4 6 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M16","8 2 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M96","1 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn256_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M1024","1 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M192","4 7 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","1 8 32 5 1 8 1 1 0 ",
+"EU48_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M320","2 7 16 2 1 1 16 1 0 ",
+"EU48_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64","1 8 32 5 1 8 1 1 0 ",
+"EU48_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M48","4 2 16 2 1 1 16 1 0 ",
+"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M256","2 8 32 5 1 8 1 1 0 ",
+"EU48_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M192","2 8 16 2 1 1 16 1 0 ",
+"EU48_k11x11_cn3_g1_s4x4_d1x1_b1_in224x224_p0x0_num1_M96","1 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M112","1 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","12 1 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn512_g1_s2x2_d1x1_b0_in32x32_p0x0_num1_M256","1 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","12 2 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M64","8 2 8 2 1 1 8 1 0 ",
+"EU48_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num2_M192","2 7 16 2 1 1 16 1 0 ",
+"EU48_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256","2 5 16 2 1 1 16 1 0 ",
+"EU48_k1x1_cn16_g1_s1x1_d1x1_b0_in256x256_p0x0_num1_M4","8 3 8 2 1 1 8 1 0 ",
+"EU48_k1x5_cn32_g1_s1x1_d1x1_b1_in64x64_p0x2_num1_M32","4 7 16 2 1 1 16 1 0 ",
+"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256","4 7 8 2 1 1 8 1 0 ",
+"EU48_k3x3_cn3_g1_s2x2_d1x1_b1_in256x256_p1x1_num1_M13","1 1 1 4 1 1 1 0 1 ",
+"EU48_k11x7_cn3_g1_s3x4_d1x1_b1_in64x64_p3x2_num1_M64","4 1 16 2 1 1 16 1 0 ",
+"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M96","8 3 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M16","1 8 32 5 1 8 1 1 0 ",
+"EU48_k3x3_cn32_g1_s1x1_d2x2_b1_in64x64_p2x2_num1_M32","3 3 16 2 1 1 16 1 0 ",
+"EU48_k3x3_cn32_g1_s1x1_d8x8_b1_in64x64_p8x8_num1_M32","1 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M96","2 8 32 5 1 8 1 1 0 ",
+"EU48_k2x2_cn64_g1_s2x2_d1x1_b0_in128x128_p0x0_num1_M32","4 4 16 2 1 1 16 1 0 ",
+"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","4 3 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn512_g1_s1x1_d1x1_b0_in32x32_p0x0_num1_M128","2 8 32 5 1 8 1 1 0 ",
+"EU48_k3x3_cn16_g1_s1x1_d1x1_b1_in128x128_p1x1_num1_M16","2 7 16 2 1 1 16 1 0 ",
+"EU48_k1x1_cn4_g1_s1x1_d1x1_b0_in256x256_p0x0_num1_M16","2 8 32 5 1 8 1 1 0 ",
+"EU48_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M128","6 2 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M4","4 2 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M144","1 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","1 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M384","1 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn256_g1_s2x2_d1x1_b0_in64x64_p0x0_num1_M128","1 16 32 5 1 16 1 1 0 ",
+"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M192","1 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn1024_g1_s2x2_d1x1_b0_in16x16_p0x0_num1_M2048","1 16 32 5 1 16 1 1 0 ",
+"EU48_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M384","1 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M128","1 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn16_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M64","1 16 32 5 1 16 1 1 0 ",
+"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","4 7 8 2 1 1 8 1 0 ",
+"EU48_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M192","2 5 16 2 1 1 16 1 0 ",
+"EU48_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128","6 4 16 2 1 1 16 1 0 ",
+"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","8 3 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M64","12 2 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M64","2 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn2048_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M512","4 7 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64","12 2 8 2 1 1 8 1 0 ",
+"EU48_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M224","1 8 32 5 1 8 1 1 0 ",
+"EU48_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384","2 7 16 2 1 1 16 1 0 ",
+"EU48_k3x3_cn32_g1_s1x1_d4x4_b1_in64x64_p4x4_num1_M32","1 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256","1 8 32 5 1 8 1 1 0 ",
+"EU48_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384","2 4 16 2 1 1 16 1 0 ",
+"EU48_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M288","2 4 16 2 1 1 16 1 0 ",
+"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M48","4 6 8 2 1 1 8 1 0 ",
+"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M64","8 1 16 2 1 1 16 1 0 ",
+"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","1 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","12 2 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn256_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","2 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192","4 6 8 2 1 1 8 1 0 ",
+"EU48_k5x5_cn96_g2_s1x1_d1x1_b1_in32x32_p2x2_num1_M128","4 5 16 2 1 1 16 1 0 ",
+"EU48_k3x3_cn256_g1_s1x1_d1x1_b0_in16x16_p1x1_num1_M256","2 6 16 2 1 1 16 1 0 ",
+"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","8 3 8 2 1 1 8 1 0 ",
+"EU48_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M48","4 2 16 2 1 1 16 1 0 ",
+"EU48_k3x3_cn64_g1_s1x1_d1x1_b0_in64x64_p1x1_num1_M64","10 2 16 2 1 1 16 1 0 ",
+"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","4 6 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","4 5 8 2 1 1 8 1 0 ",
+"EU48_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M208","2 5 16 2 1 1 16 1 0 ",
+"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M256","4 6 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn512_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M2048","2 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M48","4 6 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn64_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M256","1 16 32 5 1 16 1 1 0 ",
+"EU48_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M224","2 7 16 2 1 1 16 1 0 ",
+"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M32","1 8 32 5 1 8 1 1 0 ",
+"EU48_k5x1_cn32_g1_s1x1_d1x1_b0_in64x64_p2x0_num1_M32","2 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn64_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","1 8 32 5 1 8 1 1 0 ",
+"EU48_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M288","2 7 16 2 1 1 16 1 0 ",
+"EU48_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M192","2 7 16 2 1 1 16 1 0 ",
+"EU48_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num2_M32","4 3 16 2 1 1 16 1 0 ",
+"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num2_M96","4 2 16 2 1 1 16 1 0 ",
+"EU48_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M208","2 5 16 2 1 1 16 1 0 ",
+"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M96","4 2 16 2 1 1 16 1 0 ",
+"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24","12 1 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn64_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M16","4 7 16 2 1 1 16 1 0 ",
+"EU48_k1x1_cn256_g1_s2x2_d1x1_b0_in64x64_p0x0_num1_M512","2 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn1024_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M256","1 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","1 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M32","1 8 32 5 1 8 1 1 0 ",
+"EU48_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M320","2 8 16 2 1 1 16 1 0 ",
+"EU48_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num2_M192","6 4 16 2 1 1 16 1 0 ",
+"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M128","4 3 16 2 1 1 16 1 0 ",
+"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","4 6 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","1 8 32 5 1 8 1 1 0 ",
+"EU48_k3x3_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M5","2 3 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64","1 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192","1 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32","8 3 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num2_M64","1 16 32 5 1 16 1 1 0 ",
+"EU48_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128","8 2 16 2 1 1 16 1 0 ",
+"EU48_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num2_M64","1 8 32 5 1 8 1 1 0 ",
+"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","4 6 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M32","1 16 32 5 1 16 1 1 0 ",
+"EU48_k1x1_cn1024_g1_s2x2_d1x1_b0_in16x16_p0x0_num1_M512","1 8 32 5 1 8 1 1 0 ",
+"EU48_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M64","4 4 16 2 1 1 16 1 0 ",
+"EU48_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32","8 3 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","12 2 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64","8 3 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn512_g1_s2x2_d1x1_b0_in32x32_p0x0_num1_M1024","1 8 32 5 1 8 1 1 0 ",
+"EU48_k5x4_cn6_g3_s3x2_d1x1_b1_in128x80_p1x0_num2_M4","1 1 1 4 1 1 1 0 1 ",
+"EU48_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M256","2 7 16 2 1 1 16 1 0 ",
+"EU48_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M24","8 2 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M16","12 1 8 2 1 1 8 1 0 ",
+"EU48_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num2_M128","10 2 16 2 1 1 16 1 0 ",
+"EU48_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64","1 16 32 5 1 16 1 1 0 ",
+"EU48_k3x3_cn32_g1_s1x1_d16x16_b1_in64x64_p16x16_num1_M32","1 16 32 5 1 16 1 1 0 ",
+"EU48_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","4 7 8 2 1 1 8 1 0 ",
+"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M16","12 2 8 2 1 1 8 1 0 ",
+"EU48_k4x4_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M2","1 4 16 2 1 1 16 1 0 ",
+"EU48_k1x1_cn64_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M4","8 2 8 2 1 1 8 1 0 ",
+"EU48_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64","4 2 16 2 1 1 16 1 0 ",
+"EU48_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M16","1 8 32 5 1 8 1 1 0 ",
+"EU48_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M128","4 2 16 2 1 1 16 1 0 ",
+"EU48_k3x3_cn32_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M32","2 8 16 2 1 1 16 1 0 ",
+"EU48_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128","4 2 16 2 1 1 16 1 0 ",
+// Below is the OpenCL platform/device information for which these configurations were tuned
+/*******************************************************************************
+Number of platforms 1
+ Platform Name Intel(R) OpenCL
+ Platform Vendor Intel(R) Corporation
+ Platform Version OpenCL 2.0
+ Platform Profile FULL_PROFILE
+ Platform Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_driver_diagnostics cl_intel_media_block_io cl_intel_motion_estimation cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups
+ Platform Extensions function suffix INTEL
+
+ Platform Name Intel(R) OpenCL
+Number of devices 1
+ Device Name Intel(R) HD Graphics
+ Device Vendor Intel(R) Corporation
+ Device Vendor ID 0x8086
+ Device Version OpenCL 2.0
+ Driver Version 16.5.59288
+ Device OpenCL C Version OpenCL C 2.0
+ Device Type GPU
+ Device Profile FULL_PROFILE
+ Max compute units 24
+ Max clock frequency 1050MHz
+ Device Partition (core)
+ Max number of sub-devices 0
+ Supported partition types by <unknown> (0x7F5100000000)
+ Max work item dimensions 3
+ Max work item sizes 256x256x256
+ Max work group size 256
+ Preferred work group size multiple 32
+ Preferred / native vector sizes
+ char 16 / 16
+ short 8 / 8
+ int 4 / 4
+ long 1 / 1
+ half 8 / 8 (cl_khr_fp16)
+ float 1 / 1
+ double 1 / 1 (cl_khr_fp64)
+ Half-precision Floating-point support (cl_khr_fp16)
+ Denormals Yes
+ Infinity and NANs Yes
+ Round to nearest Yes
+ Round to zero Yes
+ Round to infinity Yes
+ IEEE754-2008 fused multiply-add Yes
+ Support is emulated in software No
+ Correctly-rounded divide and sqrt operations No
+ Single-precision Floating-point support (core)
+ Denormals Yes
+ Infinity and NANs Yes
+ Round to nearest Yes
+ Round to zero Yes
+ Round to infinity Yes
+ IEEE754-2008 fused multiply-add Yes
+ Support is emulated in software No
+ Correctly-rounded divide and sqrt operations Yes
+ Double-precision Floating-point support (cl_khr_fp64)
+ Denormals Yes
+ Infinity and NANs Yes
+ Round to nearest Yes
+ Round to zero Yes
+ Round to infinity Yes
+ IEEE754-2008 fused multiply-add Yes
+ Support is emulated in software No
+ Correctly-rounded divide and sqrt operations No
+ Address bits 64, Little-Endian
+ Global memory size 6588802663 (6.136GiB)
+ Error Correction support No
+ Max memory allocation 3294401331 (3.068GiB)
+ Unified memory for Host and Device Yes
+ Shared Virtual Memory (SVM) capabilities (core)
+ Coarse-grained buffer sharing Yes
+ Fine-grained buffer sharing No
+ Fine-grained system sharing No
+ Atomics No
+ Minimum alignment for any data type 128 bytes
+ Alignment of base address 1024 bits (128 bytes)
+ Preferred alignment for atomics
+ SVM 64 bytes
+ Global 64 bytes
+ Local 64 bytes
+ Max size for global variable 65536 (64KiB)
+ Preferred total size of global vars 3294401331 (3.068GiB)
+ Global Memory cache type Read/Write
+ Global Memory cache size 524288
+ Global Memory cache line 64 bytes
+ Image support Yes
+ Max number of samplers per kernel 16
+ Max size for 1D images from buffer 205900083 pixels
+ Max 1D or 2D image array size 2048 images
+ Base address alignment for 2D image buffers 4 bytes
+ Pitch alignment for 2D image buffers 4 bytes
+ Max 2D image size 16384x16384 pixels
+ Max 3D image size 16384x16384x2048 pixels
+ Max number of read image args 128
+ Max number of write image args 128
+ Max number of read/write image args 128
+ Max number of pipe args 16
+ Max active pipe reservations 1
+ Max pipe packet size 1024
+ Local memory type Local
+ Local memory size 65536 (64KiB)
+ Max constant buffer size 3294401331 (3.068GiB)
+ Max number of constant args 8
+ Max size of kernel argument 1024
+ Queue properties (on host)
+ Out-of-order execution Yes
+ Profiling Yes
+ Queue properties (on device)
+ Out-of-order execution Yes
+ Profiling Yes
+ Preferred size 131072 (128KiB)
+ Max size 67108864 (64MiB)
+ Max queues on device 1
+ Max events on device 1024
+ Prefer user sync for interop Yes
+ Profiling timer resolution 83ns
+ Execution capabilities
+ Run OpenCL kernels Yes
+ Run native kernels No
+ SPIR versions 1.2
+ printf() buffer size 4194304 (4MiB)
+ Built-in kernels block_motion_estimate_intel;block_advanced_motion_estimate_check_intel;block_advanced_motion_estimate_bidirectional_check_intel
+ Motion Estimation accelerator version (Intel) 2
+ Device Available Yes
+ Compiler Available Yes
+ Linker Available Yes
+ Device Extensions cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_driver_diagnostics cl_intel_media_block_io cl_intel_motion_estimation cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_required_subgroup_size cl_intel_subgroups cl_intel_subgroups_short cl_intel_va_api_media_sharing cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_depth_images cl_khr_fp16 cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_khr_spir cl_khr_subgroups
+
+NULL platform behavior
+ clGetPlatformInfo(NULL, CL_PLATFORM_NAME, ...) No platform
+ clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, ...) No platform
+ clCreateContext(NULL, ...) [default] No platform
+ clCreateContext(NULL, ...) [other] Success [INTEL]
+ clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU) No platform
+ clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU) No platform
+ clCreateContextFromType(NULL, CL_DEVICE_TYPE_ACCELERATOR) No platform
+ clCreateContextFromType(NULL, CL_DEVICE_TYPE_CUSTOM) No platform
+ clCreateContextFromType(NULL, CL_DEVICE_TYPE_ALL) No platform
+********************************************************************************/
+"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64","2 8 32 5 1 8 1 1 0 ",
+"EU24_k5x1_cn32_g1_s1x1_d1x1_b0_in64x64_p2x0_num1_M32","4 6 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ",
+"EU24_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128","4 2 16 2 1 1 16 1 0 ",
+"EU24_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M224","2 5 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M32","1 8 32 5 1 8 1 1 0 ",
+"EU24_k2x2_cn16_g1_s2x2_d1x1_b0_in256x256_p0x0_num1_M16","1 8 32 5 1 8 1 1 0 ",
+"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M128","4 3 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn256_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M256","1 8 32 5 1 8 1 1 0 ",
+"EU24_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M384","1 8 32 5 1 8 1 1 0 ",
+"EU24_k3x3_cn256_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384","2 7 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn2048_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M512","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M16","2 8 32 5 1 8 1 1 0 ",
+"EU24_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M128","2 7 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192","1 8 32 5 1 8 1 1 0 ",
+"EU24_k3x3_cn112_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M224","2 7 16 2 1 1 16 1 0 ",
+"EU24_k3x3_cn32_g1_s1x1_d8x8_b1_in64x64_p8x8_num1_M32","1 8 32 5 1 8 1 1 0 ",
+"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M208","2 7 16 2 1 1 16 1 0 ",
+"EU24_k11x11_cn3_g1_s4x4_d1x1_b1_in224x224_p0x0_num1_M96","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","1 8 32 5 1 8 1 1 0 ",
+"EU24_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num2_M64","1 8 32 5 1 8 1 1 0 ",
+"EU24_k3x3_cn32_g1_s1x1_d2x2_b1_in64x64_p2x2_num1_M32","3 3 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M24","8 3 8 2 1 1 8 1 0 ",
+"EU24_k3x3_cn128_g1_s1x1_d1x1_b0_in32x32_p1x1_num1_M128","6 4 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M144","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn1024_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M256","2 8 32 5 1 8 1 1 0 ",
+"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M208","2 7 16 2 1 1 16 1 0 ",
+"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M128","4 3 16 2 1 1 16 1 0 ",
+"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M48","4 2 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn512_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M2048","4 7 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M16","2 8 32 5 1 8 1 1 0 ",
+"EU24_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M192","6 4 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M128","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn256_g1_s1x1_d1x1_b0_in16x16_p0x0_num1_M1024","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn32_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M128","1 16 32 5 1 16 1 1 0 ",
+"EU24_k1x1_cn4_g1_s1x1_d1x1_b0_in256x256_p0x0_num1_M16","1 8 32 5 1 8 1 1 0 ",
+"EU24_k3x3_cn192_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M384","2 7 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn128_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M32","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","4 6 8 2 1 1 8 1 0 ",
+"EU24_k5x5_cn48_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M128","4 4 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","2 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","8 2 8 2 1 1 8 1 0 ",
+"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M64","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn1024_g1_s2x2_d1x1_b0_in16x16_p0x0_num1_M2048","1 16 32 5 1 16 1 1 0 ",
+"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M64","4 3 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M384","1 8 32 5 1 8 1 1 0 ",
+"EU24_k5x4_cn6_g3_s3x2_d1x1_b1_in128x80_p1x0_num2_M4","1 1 1 4 1 1 1 0 1 ",
+"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num2_M192","6 4 16 2 1 1 16 1 0 ",
+"EU24_k3x3_cn256_g1_s1x1_d1x1_b0_in16x16_p1x1_num1_M256","2 7 16 2 1 1 16 1 0 ",
+"EU24_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M320","2 8 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M64","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn512_g1_s2x2_d1x1_b0_in32x32_p0x0_num1_M256","2 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num2_M64","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M192","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M256","2 8 32 5 1 8 1 1 0 ",
+"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M256","2 5 16 2 1 1 16 1 0 ",
+"EU24_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num2_M64","4 3 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M16","8 3 8 2 1 1 8 1 0 ",
+"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","2 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M112","2 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn64_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M16","2 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M96","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn64_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M256","2 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M128","1 8 32 5 1 8 1 1 0 ",
+"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num2_M32","4 2 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M96","8 3 8 2 1 1 8 1 0 ",
+"EU24_k3x3_cn16_g1_s1x1_d1x1_b1_in128x128_p1x1_num1_M16","6 3 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M112","1 8 32 5 1 8 1 1 0 ",
+"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num2_M96","4 3 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","8 2 8 2 1 1 8 1 0 ",
+"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M192","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","1 8 32 5 1 8 1 1 0 ",
+"EU24_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M288","2 8 16 2 1 1 16 1 0 ",
+"EU24_k3x3_cn144_g1_s1x1_d1x1_b1_in16x16_p1x1_num1_M288","2 7 16 2 1 1 16 1 0 ",
+"EU24_k7x7_cn3_g1_s2x2_d1x1_b1_in224x224_p3x3_num1_M64","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn512_g1_s1x1_d1x1_b0_in32x32_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ",
+"EU24_k3x3_cn4_g1_s1x1_d1x1_b1_in256x256_p1x1_num1_M4","10 2 8 2 1 1 8 1 0 ",
+"EU24_k3x3_cn32_g1_s1x1_d16x16_b1_in64x64_p16x16_num1_M32","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M16","8 2 8 2 1 1 8 1 0 ",
+"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M64","1 16 32 5 1 16 1 1 0 ",
+"EU24_k1x5_cn32_g1_s1x1_d1x1_b1_in64x64_p0x2_num1_M32","4 7 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","8 3 8 2 1 1 8 1 0 ",
+"EU24_k3x3_cn384_g2_s1x1_d1x1_b1_in16x16_p1x1_num1_M192","2 7 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M32","4 6 8 2 1 1 8 1 0 ",
+"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","4 6 8 2 1 1 8 1 0 ",
+"EU24_k3x3_cn32_g1_s1x1_d4x4_b1_in64x64_p4x4_num1_M32","1 8 32 5 1 8 1 1 0 ",
+"EU24_k2x2_cn64_g1_s2x2_d1x1_b0_in128x128_p0x0_num1_M32","2 8 32 5 1 8 1 1 0 ",
+"EU24_k5x5_cn96_g2_s1x1_d1x1_b1_in32x32_p2x2_num1_M128","4 3 16 2 1 1 16 1 0 ",
+"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M48","8 1 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn16_g1_s1x1_d1x1_b0_in256x256_p0x0_num1_M4","8 3 8 2 1 1 8 1 0 ",
+"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M256","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M144","2 8 32 5 1 8 1 1 0 ",
+"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M128","6 4 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn256_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M32","2 8 32 5 1 8 1 1 0 ",
+"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in32x32_p1x1_num1_M192","2 7 16 2 1 1 16 1 0 ",
+"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64","4 2 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M160","1 8 32 5 1 8 1 1 0 ",
+"EU24_k5x5_cn32_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M96","4 4 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","4 6 8 2 1 1 8 1 0 ",
+"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M32","2 8 32 5 1 8 1 1 0 ",
+"EU24_k3x3_cn32_g1_s1x1_d1x1_b1_in64x64_p1x1_num1_M32","2 8 16 2 1 1 16 1 0 ",
+"EU24_k3x3_cn96_g1_s1x1_d1x1_b1_in32x32_p1x1_num2_M128","10 2 16 2 1 1 16 1 0 ",
+"EU24_k3x3_cn160_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M320","2 7 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M32","8 3 8 2 1 1 8 1 0 ",
+"EU24_k3x3_cn64_g1_s1x1_d1x1_b0_in64x64_p1x1_num1_M64","2 8 16 2 1 1 16 1 0 ",
+"EU24_k3x3_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M5","2 3 8 2 1 1 8 1 0 ",
+"EU24_k1x1_cn16_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M64","1 16 32 5 1 16 1 1 0 ",
+"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M48","4 6 8 2 1 1 8 1 0 ",
+"EU24_k5x5_cn24_g1_s1x1_d1x1_b1_in16x16_p2x2_num1_M64","4 2 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn64_g1_s1x1_d1x1_b0_in128x128_p0x0_num1_M4","8 2 8 2 1 1 8 1 0 ",
+"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64","8 2 8 2 1 1 8 1 0 ",
+"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M96","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn64_g1_s1x1_d1x1_b0_in64x64_p0x0_num1_M64","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M192","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M48","4 6 8 2 1 1 8 1 0 ",
+"EU24_k3x3_cn128_g1_s1x1_d1x1_b1_in16x16_p1x1_num2_M256","2 7 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M4","4 4 16 2 1 1 16 1 0 ",
+"EU24_k4x4_cn3_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M2","1 3 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M96","1 8 32 5 1 8 1 1 0 ",
+"EU24_k3x3_cn512_g1_s1x1_d1x1_b0_in16x16_p1x1_num1_M512","2 7 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn256_g1_s2x2_d1x1_b0_in64x64_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn512_g1_s2x2_d1x1_b0_in32x32_p0x0_num1_M1024","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num2_M64","1 8 32 5 1 8 1 1 0 ",
+"EU24_k11x7_cn3_g1_s3x4_d1x1_b1_in64x64_p3x2_num1_M64","4 1 16 2 1 1 16 1 0 ",
+"EU24_k3x3_cn64_g1_s1x1_d1x1_b1_in64x64_p1x1_num2_M192","6 4 16 2 1 1 16 1 0 ",
+"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M64","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn64_g1_s1x1_d1x1_b1_in64x64_p0x0_num1_M64","1 16 32 5 1 16 1 1 0 ",
+"EU24_k1x1_cn192_g1_s1x1_d1x1_b1_in32x32_p0x0_num1_M16","8 3 8 2 1 1 8 1 0 ",
+"EU24_k1x1_cn128_g1_s1x1_d1x1_b0_in32x32_p0x0_num1_M512","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn1024_g1_s2x2_d1x1_b0_in16x16_p0x0_num1_M512","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M128","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn832_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M384","4 7 8 2 1 1 8 1 0 ",
+"EU24_k1x1_cn528_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M160","1 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn480_g1_s1x1_d1x1_b1_in16x16_p0x0_num1_M64","8 3 8 2 1 1 8 1 0 ",
+"EU24_k3x3_cn3_g1_s2x2_d1x1_b1_in256x256_p1x1_num1_M13","1 1 1 4 1 1 1 0 1 ",
+"EU24_k1x1_cn256_g1_s2x2_d1x1_b0_in64x64_p0x0_num1_M512","2 8 32 5 1 8 1 1 0 ",
+"EU24_k1x1_cn512_g1_s1x1_d1x1_b1_in16x16_p0x0_num2_M24","8 3 8 2 1 1 8 1 0 ",
+"EU24_k5x5_cn16_g1_s1x1_d1x1_b1_in32x32_p2x2_num1_M32","4 3 16 2 1 1 16 1 0 ",
+};
+#endif
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef _OPENCV_GREENTEA_MATH_FUNCTIONS_HPP_
+#define _OPENCV_GREENTEA_MATH_FUNCTIONS_HPP_
+#include "../../precomp.hpp"
+#include "common.hpp"
+
+namespace cv
+{
+namespace dnn
+{
+namespace ocl4dnn
+{
+
+#ifdef HAVE_OPENCL
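+// Transpose flags for the GEMM/GEMV helpers below; the numeric values match
+// the standard CBLAS enumeration, so they can be used interchangeably with it.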
+enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113};
+
+template<typename Dtype>
+bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB,
+ const int32_t M, const int32_t N, const int32_t K,
+ const UMat A, const UMat B,
+ const UMat B_image, UMat C,
+ const size_t max_image_size);
+
+template<typename Dtype>
+ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
+ bool is_matrix_a, bool transpose,
+ bool padding, int padded_height,
+ int padded_width, int height,
+ int width, int ld);
+
+template<typename Dtype>
+bool ocl4dnnGEMV(const CBLAS_TRANSPOSE TransA,
+ const int32_t M, const int32_t N, const Dtype alpha,
+ const UMat A, const int32_t offA, const UMat x,
+ const int32_t offx, const Dtype beta, UMat y,
+ const int32_t offy);
+
+template<typename Dtype>
+bool ocl4dnnAXPY(const int32_t N, const Dtype alpha,
+ const UMat x, const int32_t offx, UMat y,
+ const int32_t offy);
+
+#endif // HAVE_OPENCL
+
+} // namespace ocl4dnn
+} // namespace dnn
+} // namespace cv
+
+#endif
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef _OPENCV_LIBDNN_HPP_
+#define _OPENCV_LIBDNN_HPP_
+#include "../../precomp.hpp"
+#include <iomanip>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include "common.hpp"
+
+namespace cv { namespace dnn { namespace ocl4dnn {
+#ifdef HAVE_OPENCL
+
+struct OCL4DNNConvConfig
+{
+ OCL4DNNConvConfig() :
+ kernel(1, 1),
+ pad(0, 0),
+ stride(1, 1),
+ dilation(1, 1),
+ group(1),
+ bias_term(false)
+ {}
+ MatShape in_shape;
+ MatShape out_shape;
+ Size kernel;
+ Size pad;
+ Size stride;
+ Size dilation;
+ int group; // = 1;
+ bool bias_term; // = false;
+};
+
+
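+// Auto-tuned spatial convolution. Forward() benchmarks several kernel
+// variants (basic, IDLF, GEMM-like) for the current shape, then caches and
+// reuses the fastest configuration that passes result verification.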
+template<typename Dtype>
+class OCL4DNNConvSpatial
+{
+ public:
+ explicit OCL4DNNConvSpatial(OCL4DNNConvConfig config);
+ ~OCL4DNNConvSpatial();
+ bool Forward(const UMat& bottom_data, const UMat& weight,
+ const UMat& bias,
+ UMat& top_data, int32_t batch_size);
+
+ private:
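+ // One benchmarked kernel candidate: its work-group geometry, layout
+ // options and measured execution time.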
+ struct kernelConfig
+ {
+ std::string kernelName;
+ float executionTime;
+ size_t local_work_size[3];
+ size_t global_work_size[3];
+ int32_t workItem_output[3];
+ bool verified;
+ bool tested;
+ bool swizzle_weights;
+ bool use_null_local;
+ int32_t kernelType;
+
+ kernelConfig()
+ {}
+
+ kernelConfig(const std::string& name, const size_t* global_size, const size_t* local_size,
+ const int32_t* workItem,
+ bool swizzle,
+ int32_t type = 0)
+ : executionTime(0)
+ {
+ kernelName = name;
+ for (int32_t x = 0; x < 3; x++)
+ {
+ local_work_size[x] = local_size ? local_size[x] : 1;
+ global_work_size[x] = global_size[x];
+ workItem_output[x] = workItem[x];
+ }
+ swizzle_weights = swizzle;
+ use_null_local = local_size == NULL;
+ verified = false;
+ tested = false;
+ kernelType = type;
+ }
+ };
+
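+ // A point in the auto-tuner search space: which kernel variant to try,
+ // and with what block dimensions.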
+ struct tunerParam
+ {
+ int kernelType;
+ int blockWidth;
+ int blockHeight;
+ int blockDepth;
+
+ tunerParam(int type, int w, int h, int d)
+ {
+ kernelType = type;
+ blockWidth = w;
+            blockHeight = h;
+ blockDepth = d;
+ }
+ };
+
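+ // Helpers that append "-D name[=value]" preprocessor defines to the
+ // OpenCL build options accumulated in options_.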
+ inline void addDef(const char* name)
+ {
+ options_ << " -D " << name;
+ }
+
+ inline void addDef(const char* name, const int value)
+ {
+ options_ << " -D " << name << "=" << value;
+ }
+
+ inline void addDef(const char* name, const float value)
+ {
+ options_ << " -D " << name << "=(float)" << value;
+ }
+
+ inline void addDef(const char* name, const double value)
+ {
+ options_ << " -D " << name << "=(double)" << value;
+ }
+
+ inline void addDef(const char* name, const char* value)
+ {
+ options_ << " -D " << name << "=" << value;
+ }
+
+ void useFirstAvailable(const UMat &bottom,
+ UMat &top,
+ const UMat &weight,
+ const UMat &bias,
+ int32_t numImages,
+ UMat &verifyTop);
+ void setupKernel();
+ void collectCommonInformation();
+ void setupKernelDetails(int32_t kernelType,
+ int32_t blockM,
+ int32_t blockK,
+ int32_t blockN);
+
+ ocl::Program compileKernel();
+ typedef std::map<std::string, ocl::Program> phash_t;
+ phash_t phash;
+ void calculateBenchmark(const UMat &bottom, UMat &verifyTop,
+ const UMat &weight, const UMat &bias,
+ int32_t numImages);
+
+
+ void setupConvolution(const UMat &bottom,
+ UMat &top,
+ const UMat &weight,
+ const UMat &bias,
+                          int32_t numImages,
+ UMat &verifyTop);
+ bool createConvolutionKernel(int32_t kernelType,
+ int32_t blockWidth,
+ int32_t blockHeight,
+ int32_t blockDepth);
+ bool setupIDLF(int32_t blockWidth,
+ int32_t blockHeight,
+ int32_t blockDepth);
+ bool createBasicKernel(int32_t blockWidth,
+ int32_t blockHeight,
+ int32_t blockDepth);
+ bool createGEMMLikeConvKernel(int32_t blockWidth,
+ int32_t blockHeight,
+ int32_t blockDepth);
+ void CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
+ int32_t offset, int32_t size, bool write_only);
+ bool convolve(const UMat &bottom, UMat &top,
+ const UMat &weight, const UMat &bias,
+ int32_t numImages,
+ kernelConfig* config,
+ const cv::ocl::Queue& queue);
+ float timedConvolve(const UMat &bottom, UMat &top,
+ const UMat &weight, const UMat &bias,
+ int32_t numImages, kernelConfig* config);
+
+ bool verifyResult(const UMat &bottom,
+ UMat &top,
+ const UMat &weight,
+ const UMat &bias,
+ int32_t numImages,
+ kernelConfig* config,
+ UMat &verifyTop);
+
+ bool swizzleWeight(const UMat &weight,
+ int32_t swizzled_factor,
+ bool interleave = false);
+
+ void generateKey();
+ std::string generateSpecificKey(int32_t type, int32_t blockWidth,
+ int32_t blockHeight,
+ int32_t blockDepth);
+ void cacheTunedConfig();
+ bool loadTunedConfig();
+
+ void saveTunedConfig();
+ bool loadCachedConfig();
+
+ void unloadProgram(const std::string& kernelName);
+ void prepareKernel(const UMat &bottom, UMat &top,
+ const UMat &weight, const UMat &bias,
+ int32_t numImages);
+ bool setupKernelByConfig(int x, int y, int z, int type,
+ int lx, int ly, int lz,
+ bool swizzle, bool nullLocal);
+ void generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems);
+
+ int32_t group_;
+ bool bias_term_;
+ UMat swizzled_weights_umat;
+
+ int32_t bottom_index_;
+ int32_t output_h_;
+ int32_t output_w_;
+ int32_t kernel_h_;
+ int32_t kernel_w_;
+ int32_t height_;
+ int32_t width_;
+ int32_t pad_h_;
+ int32_t pad_w_;
+ int32_t stride_h_;
+ int32_t stride_w_;
+ int32_t dilation_h_;
+ int32_t dilation_w_;
+
+ /// M_ is the channel dimension of the output for a single group, which is the
+ /// leading dimension of the filter matrix.
+ int32_t M_;
+
+ bool tuned_;
+ std::string key_, key_sanitized_;
+ std::string short_key_;
+ std::string kernel_name_;
+ std::string cache_path_;
+ bool use_cache_path_; // true if cache_path_ directory exists
+ bool force_auto_tuning_;
+ int32_t kernel_index_;
+ std::vector< cv::Ptr<kernelConfig> > kernelQueue;
+ cv::Ptr<kernelConfig> bestKernelConfig;
+
+ int32_t bottom_dim_;
+ int32_t top_dim_;
+ int32_t num_;
+ int32_t channels_;
+ int32_t num_output_;
+
+ int32_t kernelType_;
+ int32_t blockM_;
+ int32_t blockK_;
+ int32_t blockN_;
+ std::stringstream options_;
+ cv::ocl::ProgramSource src_;
+ int32_t prev_kernel_type_;
+};
+
+typedef enum {
+ LIBDNN_POOLING_METHOD_MAX = 0,
+ LIBDNN_POOLING_METHOD_AVE = 1,
+ LIBDNN_POOLING_METHOD_STO = 2
+} ocl4dnnPoolingMethod_t;
+
+struct OCL4DNNPoolConfig
+{
+ OCL4DNNPoolConfig() :
+ kernel(1, 1),
+ pad(0, 0),
+ stride(1, 1),
+ dilation(1, 1),
+ channels(0),
+ pool_method(LIBDNN_POOLING_METHOD_MAX),
+ global_pooling(false)
+ {}
+ MatShape in_shape;
+ MatShape out_shape;
+ Size kernel;
+ Size pad;
+ Size stride;
+ Size dilation;
+
+ int channels;
+ ocl4dnnPoolingMethod_t pool_method; // = LIBDNN_POOLING_METHOD_MAX;
+ bool global_pooling; // = false;
+};
+
+template<typename Dtype>
+class OCL4DNNPool
+{
+ public:
+ explicit OCL4DNNPool(OCL4DNNPoolConfig config);
+ ~OCL4DNNPool();
+ bool Forward(const UMat& bottom_data,
+ UMat& top_data,
+ UMat& top_mask);
+ private:
+ UMat mask_idx_;
+
+ // Pooling parameters
+ std::vector<int32_t> pad_;
+ std::vector<int32_t> stride_;
+ std::vector<int32_t> kernel_shape_;
+ std::vector<int32_t> im_in_shape_;
+ std::vector<int32_t> im_out_shape_;
+
+ ocl4dnnPoolingMethod_t pool_method_;
+ int32_t count_;
+ int32_t batch_size_;
+ int32_t channels_;
+ int32_t kernel_h_;
+ int32_t kernel_w_;
+ int32_t stride_h_;
+ int32_t stride_w_;
+ int32_t pad_h_;
+ int32_t pad_w_;
+ int32_t height_;
+ int32_t width_;
+ int32_t pooled_height_;
+ int32_t pooled_width_;
+};
+
+struct OCL4DNNInnerProductConfig
+{
+ OCL4DNNInnerProductConfig() :
+ num_output(0), M(0), K(0),
+ bias_term(false), transpose(false), phase_test(true)
+ {}
+ int num_output;
+ int M;
+ int K;
+ bool bias_term;
+ bool transpose; // = false;
+ bool phase_test; // = true;
+};
+
+template<typename Dtype>
+class OCL4DNNInnerProduct
+{
+ public:
+ explicit OCL4DNNInnerProduct(OCL4DNNInnerProductConfig config);
+ ~OCL4DNNInnerProduct();
+ bool Forward(const UMat& bottom_data,
+ const UMat& weight,
+ const UMat& bias,
+ UMat& top_data);
+ private:
+ OCL4DNNInnerProductConfig config_;
+ int32_t axis_;
+ int32_t num_output_;
+ int32_t M_;
+ int32_t N_;
+ int32_t K_;
+ bool bias_term_;
+ bool transpose_;
+ bool image_copied_;
+ bool phase_test_;
+};
+
+typedef enum {
+ LRNParameter_NormRegion_ACROSS_CHANNELS = 0,
+ LRNParameter_NormRegion_WITHIN_CHANNEL = 1
+} LRNParameter_NormRegion_WITHIN_CHANNEL_t;
+
+struct OCL4DNNLRNConfig
+{
+ OCL4DNNLRNConfig() :
+ phase_test(true)
+ {}
+ MatShape in_shape;
+ LRNParameter_NormRegion_WITHIN_CHANNEL_t lrn_type;
+ bool phase_test; // = true;
+ int local_size;
+ float alpha;
+ float beta;
+ float k;
+ bool norm_by_size;
+ int32_t batch_size;
+ int32_t channels;
+ int32_t height;
+ int32_t width;
+};
+
+template<typename Dtype>
+class OCL4DNNLRN
+{
+ public:
+ explicit OCL4DNNLRN(OCL4DNNLRNConfig config);
+ bool Forward(const UMat& bottom_data, UMat& top_data);
+
+ private:
+ bool crossChannelForward(const UMat& bottom_data, UMat& top_data);
+ LRNParameter_NormRegion_WITHIN_CHANNEL_t lrn_type_;
+ bool phase_test_;
+ int32_t size_;
+ Dtype alpha_;
+ Dtype beta_;
+ Dtype k_;
+ int32_t num_;
+ int32_t channels_;
+ int32_t height_;
+ int32_t width_;
+ bool norm_by_size_;
+};
+
+struct OCL4DNNSoftmaxConfig
+{
+ OCL4DNNSoftmaxConfig()
+ {}
+ MatShape in_shape;
+ int axis;
+ int channels;
+};
+
+template<typename Dtype>
+class OCL4DNNSoftmax
+{
+ public:
+ explicit OCL4DNNSoftmax(OCL4DNNSoftmaxConfig config);
+ ~OCL4DNNSoftmax();
+ bool Forward(const UMat& bottom_data, UMat& top_data);
+
+ private:
+ int32_t softmax_axis_;
+ int32_t inner_num_;
+ int32_t outer_num_;
+ int32_t channels_;
+ int32_t count_;
+ bool use_slm_;
+ UMat scale_data_;
+};
+#endif // HAVE_OPENCL
+} // namespace ocl4dnn
+} // namespace dnn
+} // namespace cv
+#endif
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "../../precomp.hpp"
+#include "common.hpp"
+#include "opencl_kernels_dnn.hpp"
+
+using namespace cv;
+
+#ifdef HAVE_OPENCL
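+// Probe whether the OpenCL compiler accepts a given build option by trying
+// to build a trivial kernel with it.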
+bool clOptionSupport(cv::String option)
+{
+ cv::String errmsg;
+ ocl::Program program = ocl::Context::getDefault().getProg(ocl::dnn::dummy_oclsrc, option, errmsg);
+    return program.ptr() != NULL;
+}
+
+#endif // HAVE_OPENCL
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "../../precomp.hpp"
+#include "common.hpp"
+#include "math_functions.hpp"
+#include <vector>
+#include "opencl_kernels_dnn.hpp"
+
+namespace cv
+{
+namespace dnn
+{
+namespace ocl4dnn
+{
+
+#ifdef HAVE_OPENCL
+// Create an image from the buffer and copy the data for GEMM's matrix A and B.
+// Returns the image to the caller if the input image is NULL; otherwise,
+// uses the input image directly. It is the caller's responsibility to
+// release the created image.
+template<typename Dtype>
+ocl::Image2D ocl4dnnGEMMCopyBufferToImage(UMat buffer, int offset,
+ bool is_matrix_a, bool transpose,
+ bool padding, int padded_height,
+ int padded_width, int height,
+ int width, int ld)
+{
+ ocl::Context ctx = ocl::Context::getDefault();
+ ocl::Queue queue = ocl::Queue::getDefault();
+ ocl::Image2D image;
+
+ if (!is_matrix_a && transpose)
+ {
+ if (ld == width)
+ {
+ image = ocl::Image2D(buffer);
+ } else {
+ // For matrix B with transpose, we need to handle them differently.
+ // As we can't use the sub group block read to get a row easily,
+ // we have to use CL_FLOAT type with read_imagef to get the row.
+ UMat mat(height, width, CV_32FC1);
+ image = ocl::Image2D(mat);
+
+ ocl::Kernel oclk_gemm_copy("gemm_buffer_copy_image_transpose_float", ocl::dnn::gemm_image_oclsrc);
+
+ size_t global_copy[2];
+ global_copy[0] = width;
+ global_copy[1] = height;
+ oclk_gemm_copy.set(0, ocl::KernelArg::PtrReadOnly(buffer));
+ oclk_gemm_copy.set(1, image);
+ oclk_gemm_copy.set(2, offset);
+ oclk_gemm_copy.set(3, width);
+ oclk_gemm_copy.set(4, height);
+ oclk_gemm_copy.set(5, ld);
+ oclk_gemm_copy.run(2, global_copy, NULL, false);
+ }
+ } else {
+ if (!padding)
+ {
+ // copy without padding.
+ image = ocl::Image2D(buffer);
+ } else {
+ UMat mat(padded_height, padded_width, CV_8UC4);
+ image = ocl::Image2D(mat);
+
+ ocl::Kernel oclk_gemm_copy("gemm_buffer_copy_image_no_transpose_float",
+ ocl::dnn::gemm_image_oclsrc);
+
+ size_t global_copy[2];
+ global_copy[0] = padded_width;
+ global_copy[1] = padded_height;
+
+ oclk_gemm_copy.set(0, ocl::KernelArg::PtrReadOnly(buffer));
+ oclk_gemm_copy.set(1, image);
+ oclk_gemm_copy.set(2, offset);
+ oclk_gemm_copy.set(3, width);
+ oclk_gemm_copy.set(4, height);
+ oclk_gemm_copy.set(5, ld);
+
+ oclk_gemm_copy.run(2, global_copy, NULL, false);
+ }
+ }
+
+ return image;
+}
+
+template
+ocl::Image2D ocl4dnnGEMMCopyBufferToImage<float>(UMat buffer, int offset,
+ bool is_matrix_a, bool transpose,
+ bool padding, int padded_height,
+ int padded_width, int height,
+ int width, int ld);
+
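+// GEMM strategies: the FAST_IMAGE_32_* variants copy A/B into OpenCL images
+// and differ in how many output columns each work item computes (they select
+// the gemm_32_1_* / gemm_32_2_* kernels below); FAST_IMAGE_B_IMAGE assumes
+// matrix B is already an image.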
+enum gemm_type_t
+{
+ GEMM_TYPE_NONE = 0,
+ GEMM_TYPE_FAST_IMAGE_32_1,
+ GEMM_TYPE_FAST_IMAGE_32_2,
+ GEMM_TYPE_FAST_IMAGE_B_IMAGE,
+ GEMM_TYPE_MAX
+};
+
+template<typename Dtype>
+static bool ocl4dnnFastImageGEMM(const CBLAS_TRANSPOSE TransA,
+ const CBLAS_TRANSPOSE TransB, const int32_t M,
+ const int32_t N, const int32_t K, const Dtype alpha,
+ const UMat A, const int32_t offA, const UMat B,
+ const int32_t offB, const Dtype beta, UMat C,
+ const int32_t offC, bool is_image_a, bool is_image_b,
+ enum gemm_type_t gemm_type,
+ const size_t max_image_size)
+{
+ CHECK_EQ(gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == GEMM_TYPE_FAST_IMAGE_32_2 ||
+ gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE, true) << "Invalid fast image gemm type." << std::endl;
+
+ if (is_image_a)
+ {
+ CHECK_EQ(offA, 0) << "Invalid input image offset." << std::endl;
+ return false;
+ }
+
+ if (is_image_b)
+ {
+ CHECK_EQ(offB, 0) << "Invalid input image offset." << std::endl;
+ return false;
+ }
+
+ int widthA = (TransA == CblasNoTrans) ? K : M;
+ int heightA = (TransA == CblasNoTrans) ? M : K;
+ int widthB = (TransB == CblasNoTrans) ? N : K;
+ int heightB = (TransB == CblasNoTrans) ? K : N;
+
+ int ldA = widthA;
+ int ldB = widthB;
+ int ldC = N;
+
+ int A_start_x = 0, A_start_y = 0, B_start_x = 0;
+ int B_start_y = 0, C_start_x = 0, C_start_y = 0;
+ int blocksize = 1024;
+ if (gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE)
+ blocksize = max_image_size;
+ int blockA_width = blocksize;
+ int blockA_height = blocksize;
+ int blockB_width = blocksize;
+ int blockB_height = blocksize;
+ int blockC_width = blocksize;
+ int blockC_height = blocksize;
+
+ int use_buffer_indicator = 8;
+ // To fix the edge problem caused by the sub group block read,
+ // we have to pad the image if it's not a multiple of the tile size.
+ // Padding just one line is enough, as the sub group block read
+ // will clamp to the edge according to the spec.
+
+ ocl::Context ctx = ocl::Context::getDefault();
+ ocl::Queue queue = ocl::Queue::getDefault();
+
+ ocl::Image2D ImA;
+ ocl::Image2D ImB;
+
+ std::string kernel_name("gemm_");
+ if (gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE)
+ kernel_name += "32_1_";
+ else
+ kernel_name += "32_2_";
+
+ if (TransA == CblasNoTrans)
+ kernel_name += "N";
+ else
+ kernel_name += "T";
+
+ if (TransB == CblasNoTrans)
+ {
+ kernel_name += "N_";
+ } else {
+ kernel_name += "T_";
+ if (is_image_b || (K % use_buffer_indicator != 0))
+ {
+ kernel_name += "SCALAR_";
+ } else {
+ kernel_name += "BUFFER_";
+ }
+ }
+
+ if (alpha == 1)
+ kernel_name += "1_";
+ else
+ kernel_name += "0_";
+
+ if (beta == 0)
+ kernel_name += "0";
+ else
+ kernel_name += "1";
+
+ kernel_name += "_float";
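+ // e.g. no transpose on A and B, alpha == 1, beta == 0 yields "gemm_32_1_NN_1_0_float".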
+
+ ocl::Kernel oclk_gemm_float(kernel_name.c_str(), ocl::dnn::gemm_image_oclsrc);
+ if (oclk_gemm_float.empty())
+ return false;
+
+ while (C_start_y < M)
+ {
+ blockC_width = std::min(static_cast<int>(N) - C_start_x, blocksize);
+ blockC_height = std::min(static_cast<int>(M) - C_start_y, blocksize);
+
+ int isFirstColBlock = 1;
+ for (int k = 0; k < K; k += blocksize)
+ {
+ blockA_width = std::min(widthA - A_start_x, blocksize);
+ blockA_height = std::min(heightA - A_start_y, blocksize);
+ blockB_width = std::min(widthB - B_start_x, blocksize);
+ blockB_height = std::min(heightB - B_start_y, blocksize);
+ int block_Ksize = std::min(static_cast<int>(K) - k, blocksize);
+
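+ // Round block_Ksize up to the next multiple of 8 (e.g. 100 -> 104) to match
+ // the sub group block read width.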
+ int padded_k = block_Ksize + ((block_Ksize & 7) ? (8 - (block_Ksize & 7)) : 0);
+ int imageA_w = (TransA == CblasNoTrans) ? padded_k : blockA_width;
+ int imageA_h = (TransA == CblasNoTrans) ? blockA_height : padded_k;
+ int imageB_w = (TransB == CblasNoTrans) ? blockB_width : padded_k;
+ int imageB_h = (TransB == CblasNoTrans) ? padded_k : blockB_height;
+
+ int blockA_offset = offA + A_start_y * ldA + A_start_x;
+ int blockB_offset = offB + B_start_y * ldB + B_start_x;
+ int blockC_offset = offC + C_start_y * ldC + C_start_x;
+ if (TransB == CblasNoTrans)
+ {
+ bool padding_A = false;
+ bool padding_B = false;
+
+ if (!is_image_a && !is_image_b)
+ {
+ if (M * K < N * K)
+ padding_B = true;
+ else
+ padding_A = true;
+ }
+
+ if (!is_image_a)
+ {
+ ImA = ocl4dnnGEMMCopyBufferToImage<Dtype>(A, blockA_offset,
+ true, TransA != CblasNoTrans,
+ padding_A, imageA_h, imageA_w,
+ blockA_height, blockA_width, ldA);
+ }
+ if (!is_image_b)
+ {
+ ImB = ocl4dnnGEMMCopyBufferToImage<Dtype>(B, blockB_offset,
+ false, false,
+ padding_B, imageB_h, imageB_w,
+ blockB_height, blockB_width, ldB);
+ }
+ } else {
+ // We will use normal read_imagef to read image B when B is transposed,
+ // thus we don't need to pad image A at all.
+ if (!is_image_a)
+ {
+ bool padding;
+ padding = !is_image_b;
+ ImA = ocl4dnnGEMMCopyBufferToImage<Dtype>(A, blockA_offset,
+ true, TransA != CblasNoTrans,
+ padding, imageA_h, imageA_w,
+ blockA_height, blockA_width, ldA);
+ }
+
+ if (!is_image_b && (K % use_buffer_indicator != 0))
+ {
+ ImB = ocl4dnnGEMMCopyBufferToImage<Dtype>(B, blockB_offset,
+ false, true, false, imageB_h, imageB_w,
+ blockB_height, blockB_width, ldB);
+ }
+ }
+
+ size_t global[2];
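+ // Round the X dimension up to a multiple of 8 (the sub group size used as
+ // the local size below); each work item column covers 32 output rows.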
+ if (gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 || gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE)
+ {
+ global[0] = (size_t)( blockC_width + 7 ) & ~7;
+ } else {
+ global[0] = (size_t)( (blockC_width / 2 ) + 7 ) & ~7;
+ }
+ global[1] = (size_t)(blockC_height + 31) / 32;
+
+ size_t local[2];
+ local[0] = 8;
+ local[1] = 1;
+
+ cl_uint arg_idx = 0;
+ if (is_image_a)
+ oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrReadOnly(A));
+ else
+ oclk_gemm_float.set(arg_idx++, ImA);
+
+ if (TransB == CblasNoTrans || is_image_b || (K % use_buffer_indicator != 0))
+ {
+ if (is_image_b)
+ oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrReadOnly(B));
+ else
+ oclk_gemm_float.set(arg_idx++, ImB);
+ } else {
+ oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrReadOnly(B));
+ oclk_gemm_float.set(arg_idx++, blockB_offset);
+ oclk_gemm_float.set(arg_idx++, ldB);
+ }
+ oclk_gemm_float.set(arg_idx++, ocl::KernelArg::PtrWriteOnly(C));
+ oclk_gemm_float.set(arg_idx++, blockC_offset);
+ oclk_gemm_float.set(arg_idx++, blockC_height);
+ oclk_gemm_float.set(arg_idx++, blockC_width);
+ oclk_gemm_float.set(arg_idx++, ldC);
+ oclk_gemm_float.set(arg_idx++, alpha);
+ oclk_gemm_float.set(arg_idx++, beta);
+ oclk_gemm_float.set(arg_idx++, padded_k);
+ if (TransB != CblasNoTrans)
+ oclk_gemm_float.set(arg_idx++, block_Ksize);
+ oclk_gemm_float.set(arg_idx++, isFirstColBlock);
+
+ if (!oclk_gemm_float.run(2, global, local, false))
+ return false;
+
+ if (TransA == CblasNoTrans)
+ A_start_x += blockA_width;
+ else
+ A_start_y += blockA_height;
+
+ if (TransB == CblasNoTrans)
+ B_start_y += blockB_height;
+ else
+ B_start_x += blockB_width;
+
+ isFirstColBlock = 0;
+ }
+
+ C_start_x += blockC_width;
+ if (TransA == CblasNoTrans)
+ A_start_x = 0;
+ else
+ A_start_y = 0;
+ if (TransB == CblasNoTrans)
+ {
+ B_start_x += blockB_width;
+ B_start_y = 0;
+ } else {
+ B_start_y += blockB_height;
+ B_start_x = 0;
+ }
+ if (C_start_x >= N)
+ {
+ C_start_x = 0;
+ B_start_x = 0;
+ B_start_y = 0;
+ C_start_y += blockC_height;
+ if (TransA == CblasNoTrans)
+ A_start_y += blockA_height;
+ else
+ A_start_x += blockA_width;
+ }
+ }
+
+ return true;
+}
+
+template<typename Dtype>
+bool ocl4dnnGEMMCommon(const CBLAS_TRANSPOSE TransB,
+ const int32_t M, const int32_t N, const int32_t K,
+ const UMat A, const UMat B,
+ const UMat B_image, UMat C,
+ const size_t max_image_size)
+{
+ gemm_type_t gemm_type = GEMM_TYPE_FAST_IMAGE_32_1;
+
+ if (gemm_type == GEMM_TYPE_FAST_IMAGE_32_1 ||
+ gemm_type == GEMM_TYPE_FAST_IMAGE_32_2)
+ {
+ return ocl4dnnFastImageGEMM<Dtype>(CblasNoTrans, TransB, M, N, K,
+ (Dtype)1., A, 0, B, 0, (Dtype)0., C,
+ 0, false, false, gemm_type, max_image_size);
+ }
+ else if (gemm_type == GEMM_TYPE_FAST_IMAGE_B_IMAGE)
+ {
+ return ocl4dnnFastImageGEMM<Dtype>(CblasNoTrans, TransB, M, N, K,
+ (Dtype)1., A, 0, B_image, 0, (Dtype)0., C,
+ 0, false, true,
+ GEMM_TYPE_FAST_IMAGE_B_IMAGE,
+ max_image_size);
+ }
+ return false;
+}
+
+template bool ocl4dnnGEMMCommon<float>(const CBLAS_TRANSPOSE TransB,
+ const int32_t M, const int32_t N, const int32_t K,
+ const UMat A, const UMat B,
+ const UMat B_image, UMat C,
+ const size_t max_image_size);
+
+template<typename Dtype>
+bool ocl4dnnGEMV(const CBLAS_TRANSPOSE TransA,
+ const int32_t M, const int32_t N, const Dtype alpha,
+ const UMat A, const int32_t offA, const UMat x,
+ const int32_t offx, const Dtype beta, UMat y,
+ const int32_t offy)
+{
+ return false;
+}
+
+template<>
+bool ocl4dnnGEMV<float>(const CBLAS_TRANSPOSE TransA,
+ const int32_t M, const int32_t N, const float alpha,
+ const UMat A, const int32_t offA, const UMat x,
+ const int32_t offx, const float beta, UMat y,
+ const int32_t offy)
+{
+ ocl::Queue queue = ocl::Queue::getDefault();
+ bool ret = false;
+
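+ // matvec_mul4 handles rows in groups of 4; if row_size is not a multiple
+ // of 4, matvec_mul1 below processes the remaining rows one at a time.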
+ if (TransA == CblasNoTrans)
+ {
+ ocl::Kernel k(CL_KERNEL_SELECT("matvec_mul4"), cv::ocl::dnn::matvec_mul_oclsrc);
+ if (k.empty())
+ return false;
+
+ uint row_size = M;
+ uint col_size = N;
+ size_t localsize[] = { 128 };
+ size_t globalsize[] = { row_size / 4 * localsize[0] };
+
+ uint argId = 0;
+ k.set(argId++, ocl::KernelArg::PtrReadOnly(A));
+ k.set(argId++, offA);
+ k.set(argId++, cl_uint(col_size));
+ k.set(argId++, cl_uint(col_size%4));
+ k.set(argId++, ocl::KernelArg::PtrReadOnly(x));
+ k.set(argId++, offx);
+ k.set(argId++, alpha);
+ k.set(argId++, beta);
+ k.set(argId++, ocl::KernelArg::PtrWriteOnly(y));
+ k.set(argId++, offy);
+ k.set(argId++, NULL, localsize[0] * sizeof(cl_float4));
+
+ ret = k.run(1, globalsize, localsize, false);
+
+ if ((row_size % 4) != 0 && ret)
+ {
+ ocl::Kernel k_1(CL_KERNEL_SELECT("matvec_mul1"), cv::ocl::dnn::matvec_mul_oclsrc);
+ size_t localsize[] = { 128 };
+ size_t globalsize[] = { row_size % 4 * localsize[0] };
+ uint row_offset = row_size - (row_size % 4);
+
+ uint argId = 0;
+ k_1.set(argId++, ocl::KernelArg::PtrReadOnly(A));
+ k_1.set(argId++, offA);
+ k_1.set(argId++, cl_uint(col_size));
+ k_1.set(argId++, cl_uint(row_offset));
+ k_1.set(argId++, cl_uint(col_size%4));
+ k_1.set(argId++, ocl::KernelArg::PtrReadOnly(x));
+ k_1.set(argId++, offx);
+ k_1.set(argId++, alpha);
+ k_1.set(argId++, beta);
+ k_1.set(argId++, ocl::KernelArg::PtrWriteOnly(y));
+ k_1.set(argId++, offy);
+ k_1.set(argId++, NULL, localsize[0] * sizeof(cl_float));
+
+ ret = k_1.run(1, globalsize, localsize, false);
+ }
+ }
+ return ret;
+}
+
+template<typename Dtype>
+bool ocl4dnnAXPY(const int32_t N, const Dtype alpha,
+ const UMat X, const int32_t offX, UMat Y,
+ const int32_t offY)
+{
+ ocl::Context ctx = ocl::Context::getDefault();
+
+ ocl::Kernel oclk_axpy(CL_KERNEL_SELECT("axpy"), cv::ocl::dnn::math_oclsrc);
+ if (oclk_axpy.empty())
+ return false;
+
+ size_t global[] = { 128 * 128 };
+ size_t local[] = { 128 };
+
+ cl_uint argIdx = 0;
+ oclk_axpy.set(argIdx++, N);
+ oclk_axpy.set(argIdx++, alpha);
+ oclk_axpy.set(argIdx++, ocl::KernelArg::PtrReadOnly(X));
+ oclk_axpy.set(argIdx++, offX);
+ oclk_axpy.set(argIdx++, ocl::KernelArg::PtrWriteOnly(Y));
+ oclk_axpy.set(argIdx++, offY);
+
+ return oclk_axpy.run(1, global, local, false);
+}
+
+template bool ocl4dnnAXPY<float>(const int32_t N, const float alpha,
+ const UMat X, const int32_t offX,
+ UMat Y, const int32_t offY);
+
+#endif // HAVE_OPENCL
+
+} // namespace ocl4dnn
+} // namespace dnn
+} // namespace cv
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "../../precomp.hpp"
+
+#include <opencv2/core/utils/configuration.private.hpp>
+
+#include <string>
+#include <vector>
+#include <fstream>
+#include <sys/stat.h>
+#include <assert.h>
+#include "common.hpp"
+#include "ocl4dnn.hpp"
+#include "opencl_kernels_dnn.hpp"
+#include "math_functions.hpp"
+#include "default_kernel_config.hpp"
+
+#if defined WIN32 || defined _WIN32
+#include <windows.h>
+#include <direct.h>
+#endif
+
+#ifdef HAVE_OPENCL
+namespace cv { namespace dnn { namespace ocl4dnn {
+static cv::Mutex kernelConfigMutex;
+typedef std::map<std::string, std::string> kernel_hash_t;
+static kernel_hash_t kernelConfigMap;
+static bool defaultConfigLoaded = false;
+
+template<typename Dtype>
+OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
+{
+ bias_term_ = config.bias_term;
+ int dims = config.in_shape.size();
+ int spatial_dims = 2;
+
+ channels_ = config.in_shape[dims - spatial_dims - 1];
+ num_output_ = config.out_shape[dims - spatial_dims - 1];
+ group_ = config.group;
+
+ prev_kernel_type_ = -1;
+ tuned_ = false;
+
+ // assumption: spatial dimension is 2.
+ kernel_h_ = config.kernel.height;
+ kernel_w_ = config.kernel.width;
+ pad_h_ = config.pad.height;
+ pad_w_ = config.pad.width;
+ stride_h_ = config.stride.height;
+ stride_w_ = config.stride.width;
+ dilation_h_ = config.dilation.height;
+ dilation_w_ = config.dilation.width;
+ M_ = num_output_ / group_;
+ height_ = config.in_shape[dims - spatial_dims + 0];
+ width_ = config.in_shape[dims - spatial_dims + 1];
+ output_h_ = config.out_shape[dims - spatial_dims + 0];
+ output_w_ = config.out_shape[dims - spatial_dims + 1];
+ bottom_dim_ = channels_ * width_ * height_;
+ top_dim_ = num_output_ * output_w_ * output_h_;
+
+ cache_path_ = utils::getConfigurationParameterString("OPENCV_OCL4DNN_CONFIG_PATH", "");
+
+ use_cache_path_ = false;
+ if (!cache_path_.empty())
+ {
+#if defined _WIN32
+ struct _stat file_stat;
+ use_cache_path_ = _stat(cache_path_.c_str(), &file_stat) == 0 &&
+ ((_S_IFDIR & file_stat.st_mode) != 0);
+#else
+ struct stat file_stat;
+ use_cache_path_ = stat(cache_path_.c_str(), &file_stat) == 0 &&
+ S_ISDIR(file_stat.st_mode);
+#endif
+ if (!use_cache_path_)
+ {
+ static int warn_ = 0;
+ if (!warn_)
+ {
+ std::cerr
+ << "OpenCV(ocl4dnn): Kernel configuration cache directory doesn't exist: " << cache_path_ << std::endl
+ << std::endl;
+ warn_ = true;
+ }
+ }
+ }
+
+ force_auto_tuning_ =
+ (use_cache_path_ && !utils::getConfigurationParameterBool("OPENCV_OCL4DNN_DISABLE_AUTO_TUNING", false))
+ || utils::getConfigurationParameterBool("OPENCV_OCL4DNN_FORCE_AUTO_TUNING", false);
+}
+
+template<typename Dtype>
+OCL4DNNConvSpatial<Dtype>::~OCL4DNNConvSpatial()
+{
+ if (!swizzled_weights_umat.empty()) {
+ swizzled_weights_umat.release();
+ }
+}
+
+template<typename Dtype>
+void OCL4DNNConvSpatial<Dtype>::collectCommonInformation()
+{
+ addDef("Dtype", "float");
+ addDef("Dtype2", "float2");
+ addDef("Dtype4", "float4");
+ addDef("Dtype8", "float8");
+ addDef("Dtype16", "float16");
+ addDef("as_Dtype", "as_float");
+ addDef("as_Dtype2", "as_float2");
+ addDef("as_Dtype4", "as_float4");
+ addDef("as_Dtype8", "as_float8");
+ addDef("Dtype_ID", (int)CV_32F);
+ addDef("Dtype_SIZE", (int)sizeof(Dtype));
+}
+
+typedef enum {
+ KERNEL_TYPE_INTEL_IDLF = 2,
+ KERNEL_TYPE_BASIC = 4,
+ KERNEL_TYPE_GEMM_LIKE = 5
+} ocl4dnnConvSpatialKernelType_t;
+
+template<typename Dtype>
+void OCL4DNNConvSpatial<Dtype>::setupKernelDetails(int32_t kernelType,
+ int32_t blockM,
+ int32_t blockK,
+ int32_t blockN)
+{
+ std::string kernelUKey;
+ int32_t simd_size;
+
+ if (kernelType == KERNEL_TYPE_INTEL_IDLF) {
+ simd_size = blockN;
+ kernelUKey = generateSpecificKey(KERNEL_TYPE_INTEL_IDLF, blockM, blockK, 1);
+
+ // kernel name
+ kernel_name_ = "IDLF_";
+ kernel_name_ += kernelUKey;
+ if (simd_size == 16)
+ kernel_name_ += "_SIMD16";
+ else
+ kernel_name_ += "_SIMD8";
+
+ // options
+ options_ << " -cl-fast-relaxed-math -D KERNEL_IDLF -D convolve_simd=" << kernel_name_;
+ if (clOptionSupport("-cl-no-subgroup-ifp"))
+ options_ << " -cl-no-subgroup-ifp ";
+
+ // defs
+ int32_t output_width = output_w_;
+ int32_t output_height = output_h_;
+ int32_t output_block_width = blockM;
+ int32_t output_block_height = blockK;
+ const int32_t last_block_width = (output_width % output_block_width == 0) ?
+ output_block_width : output_width % output_block_width;
+ const int32_t last_block_height = (output_height % output_block_height == 0) ?
+ output_block_height : output_height % output_block_height;
+ int tile_x = alignSize((output_block_width - 1) * stride_w_ + kernel_w_ * dilation_w_, 4);
+ int tile_y = (output_block_height -1) * stride_h_ + kernel_h_ * dilation_h_;
+ int tile_y_stride = (4 * simd_size) / tile_x;
+ int invec_size = divUp(tile_y, tile_y_stride);
+
+ addDef("SIMD_SIZE", simd_size);
+ addDef("filter_qualifier", "__global");
+ addDef("OUT_BLOCK_WIDTH", output_block_width);
+ addDef("OUT_BLOCK_HEIGHT", output_block_height);
+ addDef("LAST_BLOCK_WIDTH", last_block_width);
+ addDef("LAST_BLOCK_HEIGHT", last_block_height);
+ addDef("INPUT_DEPTH", channels_ / group_);
+ addDef("TOTAL_INPUT_DEPTH_SIZE", channels_);
+ addDef("TOTAL_OUTPUT_DEPTH", num_output_);
+ addDef("INPUT_START_X", 0);
+ addDef("INPUT_START_Y", 0);
+ addDef("INPUT_START_Z", 0);
+ addDef("NUM_FILTERS", M_);
+ addDef("OUT_BUFF_OFFSET", 0);
+ addDef("TILE_X", tile_x);
+ addDef("TILE_Y", tile_y);
+ addDef("TILE_Y_STRIDE", tile_y_stride);
+ addDef("INVEC_SIZE", invec_size);
+ addDef("ALIGNED_NUM_FILTERS", (int)alignSize(M_, simd_size));
+ addDef("OUT_BLOCK_SIZE", (output_block_width*output_block_height));
+ addDef("APPLY_BIAS", bias_term_);
+
+ src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
+ }
+ else if (kernelType == KERNEL_TYPE_BASIC)
+ {
+ addDef("KERNEL_BASIC");
+
+ kernelUKey = generateSpecificKey(KERNEL_TYPE_BASIC, blockM, blockK, blockN);
+ kernel_name_ = "BASIC_";
+ kernel_name_ += kernelUKey;
+
+ // opts
+ options_ << " -cl-fast-relaxed-math -D ConvolveBasic=" << kernel_name_;
+ if (clOptionSupport("-cl-no-subgroup-ifp"))
+ options_ << " -cl-no-subgroup-ifp ";
+
+ // defs
+ addDef("CHANNELS", channels_ / group_);
+ addDef("APPLY_BIAS", bias_term_);
+ addDef("OUTPUT_Z", M_);
+ addDef("ZPAR", 1);
+
+ src_ = cv::ocl::dnn::conv_layer_spatial_oclsrc;
+ }
+ else if (kernelType == KERNEL_TYPE_GEMM_LIKE)
+ {
+ simd_size = blockK;
+ kernelUKey = generateSpecificKey(KERNEL_TYPE_GEMM_LIKE, blockM, blockK, blockN);
+
+ kernel_name_ = "U_GEMM_LIKE_CONV_";
+ kernel_name_ += kernelUKey.c_str();
+ kernel_name_ += (blockK == 8) ? "_SIMD8" : "_SIMD16";
+ std::stringstream kernelDef;
+ kernelDef << "GEMM_LIKE_CONV_" << blockN << "_" << blockM;
+ if (blockK == 16)
+ kernelDef << "_SIMD16";
+
+ // Build list of options and defines
+ options_ << " -cl-fast-relaxed-math " << " -D " << kernelDef.str()
+ << " -D Conv_Interleaved=" << kernel_name_.c_str();
+ options_ << " -cl-mad-enable";
+ if (clOptionSupport("-cl-no-subgroup-ifp"))
+ options_ << " -cl-no-subgroup-ifp ";
+
+ addDef("INPUT_DEPTH", channels_);
+ addDef("WIDTH1", M_);
+ addDef("OUT_PADDING_LEFT", 0);
+ addDef("OUT_PADDING_HEIGHT", 0);
+ addDef("OUT_DEPTH", M_);
+ addDef("NUM_BATCHES", num_);
+ addDef("DY", blockM);
+ addDef("DX", blockN);
+ addDef("KERNEL_WIDTH_DIV2", kernel_w_ / 2);
+ addDef("KERNEL_SLICE_DIV2", (kernel_w_ * kernel_h_) / 2);
+ addDef("TILE_N_LAST", M_ % 32);
+ addDef("TILE_N_LAST_DIV8", (M_ % 32) / 8);
+ addDef("APPLY_BIAS", bias_term_);
+ src_ = ocl::dnn::conv_layer_spatial_oclsrc;
+ }
+}
+
+template<typename Dtype>
+void OCL4DNNConvSpatial<Dtype>::setupKernel()
+{
+ collectCommonInformation();
+
+ addDef("KERNEL_WIDTH", kernel_w_);
+ addDef("KERNEL_HEIGHT" , kernel_h_);
+ addDef("STRIDE_X", stride_w_);
+ addDef("STRIDE_Y", stride_h_);
+ addDef("DILATION_X", dilation_w_);
+ addDef("DILATION_Y", dilation_h_);
+ if (kernelType_ != KERNEL_TYPE_BASIC)
+ {
+ addDef("INPUT_PAD_W", pad_w_);
+ addDef("INPUT_PAD_H", pad_h_);
+ }
+
+ setupKernelDetails(kernelType_, blockM_, blockK_, blockN_);
+}
+
+template<typename Dtype>
+bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
+ const UMat& weight,
+ const UMat& bias,
+ UMat& top,
+ int32_t numImages)
+{
+ num_ = numImages;
+
+ prepareKernel(bottom, top, weight, bias, numImages);
+ return convolve(bottom, top, weight, bias, numImages, bestKernelConfig, cv::ocl::Queue::getDefault());
+}
+
+template<typename Dtype>
+void OCL4DNNConvSpatial<Dtype>::calculateBenchmark(const UMat &bottom, UMat &verifyTop,
+ const UMat &weight, const UMat &bias,
+ int32_t numImages)
+{
+ options_.str(""); options_.clear(); // clear contents and state flags
+ createBasicKernel(1, 1, 1);
+ kernel_index_ = kernelQueue.size() - 1;
+ convolve(bottom, verifyTop, weight, bias, numImages, kernelQueue[kernel_index_], cv::ocl::Queue::getDefault());
+ CV_Assert(phash.find(kernelQueue[kernel_index_]->kernelName) != phash.end());
+ //unloadProgram(kernelQueue[kernel_index_]->kernelName);
+ kernelQueue.pop_back();
+ return;
+}
+
+#define dbg
+#ifdef dbg
+#define dbgPrint(x) (x)
+#else
+#define dbgPrint(x)
+#endif
+
+// For a large enough input size, we do not need to tune kernels for different
+// sizes. The reason is that with a large input size, there are enough work items
+// to feed all the EUs.
+// FIXME for the gemm like convolution, switch back to exact image size.
+
+#define TUNING_SIZE(x) ((x) > 256 ? 256 : (alignSize(x, 16)))
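+// e.g. TUNING_SIZE(57) == 64, TUNING_SIZE(240) == 240, TUNING_SIZE(300) == 256.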
+
+template<typename Dtype>
+void OCL4DNNConvSpatial<Dtype>::generateKey()
+{
+ std::stringstream keyBuilder;
+ // FIXME: to support fuse?
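+ // Example short key for a hypothetical 3x3 stride-1 convolution:
+ // "k3x3_cn64_g1_s1x1_d1x1_b1_in224x224_p1x1_num1_M64".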
+ keyBuilder << "k" << kernel_w_ << "x" << kernel_h_ << "_"
+ << "cn" << channels_ << "_"
+ << "g" << group_ << "_"
+ << "s" << stride_w_ << "x" << stride_h_ << "_"
+ << "d" << dilation_w_ << "x" << dilation_h_ << "_"
+ << "b" << bias_term_ << "_"
+ << "in" << TUNING_SIZE(width_) << "x" << TUNING_SIZE(height_) << "_"
+ << "p" << pad_w_ << "x" << pad_h_ << "_"
+ << "num" << num_ << "_"
+ << "M" << M_;
+
+ key_ = ocl::Device::getDefault().vendorName() + "_EU" + cv::format("%d", ocl::Device::getDefault().maxComputeUnits()) + "_" + keyBuilder.str();
+ key_sanitized_ = key_;
+ for (size_t i = 0; i < key_sanitized_.size(); i++)
+ {
+ char c = key_sanitized_[i];
+ if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'))
+ {
+ key_sanitized_[i] = '_';
+ }
+ }
+ // TODO add hash?
+ // key_sanitized_ = key_sanitized_ + cv::format("_%08llx", crc64((uchar*)key_.c_str(), key_.size()));
+ short_key_ = keyBuilder.str();
+}
+
+template<typename Dtype>
+std::string OCL4DNNConvSpatial<Dtype>::generateSpecificKey(int32_t type, int32_t blockWidth,
+ int32_t blockHeight, int32_t blockDepth)
+{
+ std::stringstream keyBuilder;
+ keyBuilder << short_key_
+ << "_" << type
+ << "_" << blockWidth
+ << "_" << blockHeight
+ << "_" << blockDepth;
+ return keyBuilder.str();
+}
+
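+// Interleave pairs of source rows: for each pair, alternating blocks of
+// blockWidth elements are taken from row y and row y+1; any trailing
+// nonInterleavedRows are copied in rowAlignment-sized chunks, leaving
+// gaps in the destination.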
+template<typename Dtype>
+void interleaveMatrix(Dtype* mem_dst, const Dtype *mem,
+ int r, int c, int interleavedRows, int nonInterleavedRows,
+ int blockWidth, int rowAlignment )
+{
+ CHECK_EQ(interleavedRows % 2, 0) <<
+ "interleaveMatrix only supports even values for interleavedRows.";
+
+ size_t memSize = r * c * sizeof(float);
+ size_t dstSize = memSize *
+ (interleavedRows + nonInterleavedRows * 2) /
+ (interleavedRows + nonInterleavedRows);
+ memset(mem_dst, 0, dstSize); // NOLINT
+
+ const int xStride = blockWidth;
+ const int yStride = c * 2;
+ const Dtype *pSrc = mem;
+ Dtype* pDst = mem_dst;
+ for (int y = 0; y < r;) {
+ for (int rows = 0; rows < interleavedRows; rows += 2) {
+ if ( y >= r ) break;
+ if ((c % xStride) == 0) {
+ for (int x = 0; x < c / xStride; x++) {
+ memcpy(pDst + x * xStride * 2, // NOLINT
+ pSrc + x * xStride, xStride * sizeof(Dtype));
+ memcpy(pDst + x * xStride * 2 + xStride, // NOLINT
+ pSrc + x * xStride + c, xStride * sizeof(Dtype));
+ }
+ } else {
+ const int count = c / xStride;
+ int x = 0;
+ for (; x < count - 1; x++) {
+ memcpy(pDst + x * xStride * 2, // NOLINT
+ pSrc + x * xStride, xStride * sizeof(Dtype));
+ memcpy(pDst + x * xStride * 2 + xStride, // NOLINT
+ pSrc + x * xStride + c, xStride * sizeof(Dtype));
+ }
+ memcpy(pDst + x * xStride * 2, // NOLINT
+ pSrc + x * xStride, xStride * sizeof(Dtype));
+ }
+ pSrc += yStride;
+ pDst += yStride;
+ y += 2;
+ }
+
+ for (int rows = 0; rows < nonInterleavedRows; rows++) {
+ if (y >= r) break;
+ const int stride = rowAlignment;
+ int remaining = c;
+ for (int x = 0; x < c; x += stride) {
+ if (remaining >= stride) {
+ memcpy(pDst + x * 2, pSrc + x, stride * sizeof(Dtype)); // NOLINT
+ remaining -=stride;
+ } else {
+ memcpy(pDst + x * 2, pSrc + x, remaining * sizeof(Dtype)); // NOLINT
+ }
+ }
+ pSrc += yStride / 2;
+ pDst += yStride;
+ y++;
+ }
+ }
+}
+
+template<typename Dtype>
+bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
+ int32_t swizzled_factor,
+ bool interleave)
+{
+ // Simply skip the weight swizzle if we already have swizzled_weights_
+ // in the test phase and are not auto-tuning.
+ // This requires that we always call convolve again with the winning
+ // configuration during the auto-tuning stage.
+ if (tuned_ && !swizzled_weights_umat.empty())
+ return true;
+
+ if (swizzled_weights_umat.empty())
+ swizzled_weights_umat.create(1, (int)alignSize(num_output_, 16) * channels_ *
+ kernel_h_ * (int)alignSize(kernel_w_, 2), CV_32FC1);
+
+ ocl::Queue queue = ocl::Queue::getDefault();
+ if (!interleave) {
+ cl_uint argIdx = 0;
+ int32_t channels = channels_ / group_;
+
+ ocl::Kernel oclk_copy_weight(CL_KERNEL_SELECT("copyWeightsSwizzled"),
+ cv::ocl::dnn::conv_spatial_helper_oclsrc);
+ if (oclk_copy_weight.empty())
+ return false;
+
+ oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
+ oclk_copy_weight.set(argIdx++, ocl::KernelArg::PtrWriteOnly(swizzled_weights_umat));
+ oclk_copy_weight.set(argIdx++, kernel_w_);
+ oclk_copy_weight.set(argIdx++, kernel_h_);
+ oclk_copy_weight.set(argIdx++, channels);
+ oclk_copy_weight.set(argIdx++, num_output_);
+ oclk_copy_weight.set(argIdx++, swizzled_factor);
+
+ size_t global_work_size_copy[3] = {
+ (size_t) (alignSize(num_output_, swizzled_factor) * channels * kernel_w_ * kernel_h_), 1, 1 };
+
+ if (!oclk_copy_weight.run(3, global_work_size_copy, NULL, false))
+ {
+ std::cout << "Swizzle kernel run failed." << std::endl;
+ return false;
+ }
+ } else {
+ // assumption: kernel dimension is 2
+ Mat weightMat = weight.getMat(ACCESS_READ);
+ Dtype* cpu_weight = (Dtype *)weightMat.ptr<float>();
+ Mat swizzledWeightMat = swizzled_weights_umat.getMat(ACCESS_WRITE);
+ Dtype* cpu_swizzled_weight = (Dtype *)swizzledWeightMat.ptr<float>();
+
+ int interleavedRows = (kernel_w_ / 2) * 2;
+ int nonInterleavedRows = kernel_w_ % 2;
+ int blockWidth = swizzled_factor; // should equal the simd size.
+ int rowAlignment = 32;
+ size_t interleaved_filter_size = M_ * kernel_w_ * kernel_h_ * channels_ * sizeof(Dtype);
+ Dtype * tmpSwizzledWeight = reinterpret_cast<Dtype*>(malloc(interleaved_filter_size));
+ CHECK_EQ(tmpSwizzledWeight != NULL, true) << "Failed to allocate temporary swizzled weight";
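+ // Transpose the weights from [out][in][h][w] to [in*h*w][out] layout
+ // before interleaving.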
+ for (int od = 0; od < M_; od++)
+ for (int id = 0; id < channels_; id++)
+ for (int r = 0; r < kernel_h_; r++)
+ for (int c = 0; c < kernel_w_; c++)
+ tmpSwizzledWeight[((id * kernel_h_ + r)* kernel_w_ + c) * M_ + od] =
+ cpu_weight[((od * channels_ + id) * kernel_h_ + r)*kernel_w_+c];
+ interleaveMatrix(cpu_swizzled_weight,
+ tmpSwizzledWeight,
+ kernel_w_ * kernel_h_ * channels_, M_,
+ interleavedRows,
+ nonInterleavedRows,
+ blockWidth,
+ rowAlignment);
+ free(tmpSwizzledWeight);
+ }
+ return true;
+}
+
+template<>
+bool OCL4DNNConvSpatial<float>::createBasicKernel(int32_t blockWidth,
+ int32_t blockHeight, int32_t blockDepth)
+{
+ kernelType_ = KERNEL_TYPE_BASIC;
+ blockM_ = blockWidth;
+ blockK_ = blockHeight;
+ blockN_ = blockDepth;
+ setupKernel();
+
+ ocl::Program program = compileKernel();
+ if (program.ptr())
+ {
+ int32_t workItemOutput[3] = { 1, 1, 1 };
+ size_t globalSize[3] = { (size_t)output_w_, (size_t)output_h_, (size_t)M_ };
+ kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &globalSize[0], (const size_t*)NULL, &workItemOutput[0],
+ false, KERNEL_TYPE_BASIC));
+ return true;
+ }
+ else
+ return false;
+}
+
+template<>
+void OCL4DNNConvSpatial<float>::CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
+ int32_t offset, int32_t size, bool write_only)
+{
+ cl_mem sub_mem;
+ cl_buffer_region region;
+ cl_int err;
+
+ region.origin = offset * sizeof(float);
+ region.size = size * sizeof(float);
+ sub_mem = clCreateSubBuffer((cl_mem)buffer.handle(ACCESS_READ),
+ write_only ? CL_MEM_WRITE_ONLY : CL_MEM_READ_ONLY,
+ CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+ if (err)
+ {
+ std::cout << "Failed to create sub buffer." << std::endl;
+ return;
+ }
+
+ int step = sizeof(float), rows = size, cols = 1;
+ ocl::convertFromBuffer(sub_mem, step, rows, cols, CV_32FC1, sub_buffer);
+
+ //decrease ocl mem refcount
+ clReleaseMemObject(sub_mem);
+}
+
+template<>
+bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
+ const UMat &weight, const UMat &bias,
+ int32_t numImages, kernelConfig* config,
+ const cv::ocl::Queue& queue)
+{
+ ocl::Program program;
+ phash_t::iterator it = phash.find(config->kernelName);
+ if (it != phash.end())
+ program = it->second;
+ else
+ return false;
+
+ int32_t bias_offset;
+
+ if (config->kernelType == KERNEL_TYPE_INTEL_IDLF) {
+ if (!swizzleWeight(weight, config->workItem_output[2], false))
+ return false;
+ size_t total_bottom_size = bottom_dim_ * numImages;
+ size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
+ size_t total_bias_size = M_ * group_;
+ size_t total_top_size = top_dim_ * numImages;
+ for (int32_t g = 0; g < group_; ++g) {
+ bias_offset = M_ * g;
+ int32_t image_offset = width_ * height_ * (channels_ / group_) * g;
+ int32_t output_image_offset = output_w_ * output_h_ * M_ * g;
+ int32_t kernel_offset = kernel_h_ * kernel_w_ * (channels_ / group_) * M_ * g;
+
+ ocl::Kernel kernel(config->kernelName.c_str(), program);
+ if (kernel.empty())
+ return false;
+
+ cl_uint argIdx = 0;
+
+ UMat img_buffer;
+ if (image_offset)
+ {
+ CreateSubBuffer(bottom, img_buffer, image_offset,
+ total_bottom_size - image_offset, false);
+ if (img_buffer.empty())
+ return false;
+
+ kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer));
+ }
+ else
+ {
+ kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+ }
+
+ UMat kernel_buffer;
+ if (kernel_offset)
+ {
+ CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset,
+ total_kernel_size - kernel_offset, false);
+ if (kernel_buffer.empty())
+ return false;
+
+ kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer));
+ }
+ else
+ {
+ kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
+ }
+
+ UMat bias_buffer;
+ if (bias_term_)
+ {
+ if (bias_offset)
+ {
+ CreateSubBuffer(bias, bias_buffer, bias_offset,
+ total_bias_size - bias_offset, false);
+ if (bias_buffer.empty())
+ return false;
+
+ kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer));
+ }
+ else
+ {
+ kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
+ }
+ }
+
+ UMat out_buffer;
+ if (output_image_offset)
+ {
+ CreateSubBuffer(top, out_buffer, output_image_offset,
+ total_top_size - output_image_offset, true);
+ if (out_buffer.empty())
+ return false;
+
+ kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
+ }
+ else
+ {
+ kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+ }
+
+ kernel.set(argIdx++, (uint16_t)width_);
+ kernel.set(argIdx++, (uint16_t)height_);
+ kernel.set(argIdx++, (uint16_t)output_w_);
+ kernel.set(argIdx++, (uint16_t)output_h_);
+ if (!kernel.run(3, config->global_work_size, config->local_work_size, false))
+ {
+ std::cout << "IDLF kernel run failed." << std::endl;
+ return false;
+ }
+ }
+ } else if (config->kernelType == KERNEL_TYPE_GEMM_LIKE) {
+ if (!swizzleWeight(weight, config->workItem_output[1], true))
+ return false;
+ size_t total_bottom_size = bottom_dim_ * numImages;
+ size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
+ size_t total_bias_size = M_ * group_;
+ size_t total_top_size = top_dim_ * numImages;
+ for (int32_t g = 0; g < group_; ++g) {
+ bias_offset = M_ * g;
+ int32_t image_offset = width_ * height_ * (channels_ / group_) * g;
+ int32_t output_image_offset = output_w_ * output_h_ * M_ * g;
+ int32_t kernel_offset = kernel_h_ * kernel_w_ * (channels_ / group_) * M_ * g;
+
+ ocl::Kernel kernel(config->kernelName.c_str(), program);
+ if (kernel.empty())
+ return false;
+
+ cl_uint argIdx = 0;
+
+ UMat img_buffer;
+ if (image_offset)
+ {
+ CreateSubBuffer(bottom, img_buffer, image_offset,
+ total_bottom_size - image_offset, false);
+ if (img_buffer.empty())
+ return false;
+
+ kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(img_buffer));
+ }
+ else
+ {
+ kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+ }
+
+ UMat kernel_buffer;
+ if (kernel_offset)
+ {
+ CreateSubBuffer(swizzled_weights_umat, kernel_buffer, kernel_offset,
+ total_kernel_size - kernel_offset, false);
+ if (kernel_buffer.empty())
+ return false;
+
+ kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(kernel_buffer));
+ }
+ else
+ {
+ kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(swizzled_weights_umat));
+ }
+
+ UMat bias_buffer;
+ if (bias_term_)
+ {
+ if (bias_offset)
+ {
+ CreateSubBuffer(bias, bias_buffer, bias_offset,
+ total_bias_size - bias_offset, false);
+ if (bias_buffer.empty())
+ return false;
+
+ kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias_buffer));
+ }
+ else
+ {
+ kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
+ }
+ }
+
+ UMat out_buffer;
+ if (output_image_offset)
+ {
+ CreateSubBuffer(top, out_buffer, output_image_offset,
+ total_top_size - output_image_offset, true);
+ if (out_buffer.empty())
+ return false;
+
+ kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(out_buffer));
+ }
+ else
+ {
+ kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+ }
+
+ kernel.set(argIdx++, (uint16_t)width_);
+ kernel.set(argIdx++, (uint16_t)height_);
+ kernel.set(argIdx++, (uint16_t)output_w_);
+ kernel.set(argIdx++, (uint16_t)output_h_);
+
+ int out_pitch_y = output_w_ * output_h_;
+ int out_pitch_z = out_pitch_y * M_;
+ int aligned_input_size = height_ * width_ * channels_ / group_;
+ int slice_pitch = width_ * height_;
+ kernel.set(argIdx++, (uint32_t)out_pitch_y);
+ kernel.set(argIdx++, (uint32_t)out_pitch_z);
+ kernel.set(argIdx++, (uint32_t)aligned_input_size);
+ kernel.set(argIdx++, (uint32_t)slice_pitch);
+
+ int blockM = config->workItem_output[0];
+ int blockK = config->workItem_output[1];
+ int blockN = config->workItem_output[2];
+ int alignedFilterWidth = alignSize(M_, blockN);
+ int alignedExpandHeight = alignSize(output_w_ * output_h_, blockM);
+ int globalWorkSizeDX = blockN;
+ int globalWorkSizeDY = blockM;
+ size_t sgemm_m = alignedExpandHeight;
+ size_t sgemm_n = alignedFilterWidth;
+ size_t gx = divUp(sgemm_n, globalWorkSizeDX);
+ size_t gy = divUp(sgemm_m, globalWorkSizeDY);
+ gy = alignSize(gy, blockK);
+ size_t global_size[3] = { gx, gy, config->global_work_size[2] };
+
+ if (!kernel.run(3, global_size, config->local_work_size, false))
+ {
+ std::cout << "GEMM like kernel run failed." << std::endl;
+ return false;
+ }
+ }
+ } else {
+ for (int32_t n = 0; n < numImages; ++n) {
+ for (int32_t g = 0; g < group_; ++g) {
+ bias_offset = M_ * g;
+ int32_t image_offset = n * bottom_dim_
+ + width_ * height_ * (channels_ / group_) * g;
+ int32_t output_image_offset = n * top_dim_
+ + output_w_ * output_h_ * M_ * g;
+
+ cl_uint argIdx = 0;
+ int32_t kernel_offset = kernel_h_ * kernel_w_ * (channels_ / group_) * M_ * g;
+
+ ocl::Kernel kernel(config->kernelName.c_str(), program);
+ if (kernel.empty())
+ return false;
+
+ kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+ kernel.set(argIdx++, image_offset);
+ kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(weight));
+ kernel.set(argIdx++, kernel_offset);
+ if (bias_term_)
+ kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bias));
+ else
+ kernel.set(argIdx++, (void *)NULL);
+ kernel.set(argIdx++, bias_offset);
+ kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+ kernel.set(argIdx++, output_image_offset);
+ kernel.set(argIdx++, (uint16_t)width_);
+ kernel.set(argIdx++, (uint16_t)height_);
+ kernel.set(argIdx++, (uint16_t)output_w_);
+ kernel.set(argIdx++, (uint16_t)output_h_);
+ kernel.set(argIdx++, (uint16_t)pad_w_);
+ kernel.set(argIdx++, (uint16_t)pad_h_);
+ if (!kernel.run(3, config->global_work_size,
+ (config->use_null_local) ? NULL : config->local_work_size,
+ false))
+ {
+ std::cout << "Basic kernel run failed." << std::endl;
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+template<>
+float OCL4DNNConvSpatial<float>::timedConvolve(const UMat &bottom, UMat &top,
+ const UMat &weight, const UMat &bias,
+ int32_t numImages, kernelConfig* config)
+{
+ cv::ocl::Queue profilingQueue;
+ try
+ {
+ profilingQueue = cv::ocl::Queue::getDefault().getProfilingQueue();
+ }
+ catch (const cv::Exception&)
+ {
+ static int warn_ = 0;
+ if (!warn_)
+ {
+ std::cout << "OpenCV(ocl4dnn): Can't create OpenCL profiling queue for auto-tuning." << std::endl;
+ warn_ = true;
+ }
+ return 1e6;
+ }
+
+ // warm up.
+ bool saved_tuned = tuned_;
+ tuned_ = false;
+ convolve(bottom, top, weight, bias, numImages, config, profilingQueue);
+
+ cv::ocl::Timer timer(profilingQueue);
+ timer.start();
+ bool res = true;
+ dbgPrint(std::cout << "Benchmarking kernel: " << config->kernelName << std::endl);
+ tuned_ = true;
+ int loop_cnt = 4;
+ for (int i = 0; i < loop_cnt; i++) {
+ res = convolve(bottom, top, weight, bias, numImages, config, profilingQueue);
+ if (!res)
+ break;
+ }
+ tuned_ = saved_tuned;
+ timer.stop();
+ if (!res) {
+ config->tested = true;
+ config->verified = false;
+ return 1e5;
+ }
+
+ float elapsedTime = timer.milliSeconds() / loop_cnt;
+ #ifdef dbg
+ double out_w = output_w_;
+ double out_h = output_h_;
+ double out_z = M_;
+ double k_w = kernel_w_;
+ double k_h = kernel_h_;
+ double k_z = channels_;
+ double totalFlops = ((k_w*k_h*k_z -1)*2)*(out_w*out_h*out_z)*num_;
+ std::cout << "\tEstimated Gflops:" << ((totalFlops/1000)/1000)/1000
+ << std::endl;
+ std::cout << "\tEstimated GFLOPS/S: " << (((totalFlops/1000)/1000)/1000)*(1000.0/elapsedTime)
+ << std::endl;
+ #if 0
+ std::cout << "Estimated utilization: " <<
+ ((((totalFlops/1000)/1000)/1000)*(1000.0/elapsedTime))/880.0
+ << std::endl;
+ #endif
+ #endif
+ return elapsedTime;
+}
+
+template<>
+bool OCL4DNNConvSpatial<float>::verifyResult(const UMat &bottom,
+ UMat &top,
+ const UMat &weight,
+ const UMat &bias,
+ int32_t numImages,
+ kernelConfig* config,
+ UMat &verifyTop)
+{
+
+ uint32_t verificationFail = 0;
+
+ if (config->verified)
+ return true;
+ else if (config->tested)
+ return false;
+
+ int32_t sz[4] = {numImages, num_output_, output_h_, output_w_};
+ top.zeros(4, sz, CV_32FC1);
+ bool saved_tuned = tuned_;
+ tuned_ = false;
+ convolve(bottom, top, weight, bias, numImages, config, cv::ocl::Queue::getDefault());
+ tuned_ = saved_tuned;
+
+ float *data = (float *)top.getMat(ACCESS_READ).ptr<float>();
+ float *verify_data = (float *)verifyTop.getMat(ACCESS_READ).ptr<float>();
+
+ for (int32_t n = 0; n < num_; ++n) {
+ for (int32_t g = 0; g < group_; ++g) {
+ int32_t output_image_offset = n * top_dim_ + output_w_ * output_h_ * M_ * g;
+ for (int out_ch = 0; out_ch < M_ && !verificationFail; out_ch++)
+ for (int h = 0; h < output_h_ && !verificationFail; h++)
+ for (int w = 0; w < output_w_; w++) {
+ size_t offset = output_image_offset + out_ch * output_w_ * output_h_ + h * output_w_ + w;
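+ // Accept up to 10% relative error; values whose reference is tiny
+ // (< 1e-3) only need to match within 1e-4 absolute.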
+ if (fabs(data[offset] - verify_data[offset]) > 0.1 * fabs(verify_data[offset]) &&
+ !(fabs(verify_data[offset]) < 1.e-3 &&
+ fabs(data[offset] - verify_data[offset]) < 1.e-4))
+ {
+ dbgPrint(printf("test verification failed @ image %d group %d "
+ "out_ch %d h %d w %d got %G expected %G\n",
+ n, g, out_ch, h, w, data[offset], verify_data[offset]));
+ verificationFail = 1;
+ goto out;
+ }
+ }
+ }
+ }
+out:
+ if (verificationFail == 1)
+ return false;
+ else
+ return true;
+}
+
+template<typename Dtype>
+void OCL4DNNConvSpatial<Dtype>::unloadProgram(const std::string& kernelName)
+{
+ ocl::Program program;
+ phash_t::iterator it = phash.find(kernelName);
+ if (it != phash.end())
+ {
+ program = it->second;
+ it->second = ocl::Program();
+ }
+ else
+ return;
+
+ ocl::Context ctx = ocl::Context::getDefault();
+ ctx.unloadProg(program);
+}
+
+template<typename Dtype>
+ocl::Program OCL4DNNConvSpatial<Dtype>::compileKernel()
+{
+ phash_t::iterator it = phash.find(kernel_name_);
+ if (it != phash.end())
+ {
+ return it->second;
+ }
+
+ String errmsg;
+ ocl::Context ctx = ocl::Context::getDefault();
+ std::string options = options_.str();
+ CV_Assert(options.size() != 0);
+ ocl::Program program = ctx.getProg(src_, options, errmsg);
+
+ phash.insert(std::pair<std::string, ocl::Program>(kernel_name_, program));
+ if (!program.ptr())
+ {
+ std::cout << "Failed to compile kernel: " << kernel_name_
+ << ", buildflags: " << options
+ << ", errmsg: " << errmsg << std::endl;
+ }
+ return program;
+}
+
+template<>
+bool OCL4DNNConvSpatial<float>::createGEMMLikeConvKernel(int32_t blockM,
+ int32_t blockK,
+ int32_t blockN)
+{
+ int32_t simd_size = blockK;
+
+ int workItemOutput[3] = { blockM, blockK, blockN };
+ size_t gx = (size_t)divUp(M_, blockN);
+ size_t gy = (size_t)divUp(output_w_ * output_h_, blockM);
+ gy = alignSize(gy, simd_size);
+ size_t gz = num_;
+ size_t global_size[3] = { gx, gy, gz };
+ size_t local_size[3] = { 1, static_cast<size_t>(simd_size), 1 };
+
+ kernelType_ = KERNEL_TYPE_GEMM_LIKE;
+ blockM_ = blockM;
+ blockK_ = blockK;
+ blockN_ = blockN;
+ setupKernel();
+
+ ocl::Program program = compileKernel();
+ if (program.ptr())
+ {
+ size_t workgroupSize_used;
+ ocl::Kernel kernel(kernel_name_.c_str(), program);
+ if (kernel.empty())
+ return false;
+
+ workgroupSize_used = kernel.preferedWorkGroupSizeMultiple();
+ if (workgroupSize_used != simd_size)
+ {
+ std::cerr << "OpenCV(ocl4dnn): The OpenCL compiler chose a simd size (" << workgroupSize_used << ") that " << std::endl;
+ std::cerr << " does not equal the size (" << simd_size << ") kernel source required." << std::endl;
+ std::cerr << " Skip this kernel " << kernel_name_ << std::endl;
+ unloadProgram(kernel_name_);
+ return false;
+ }
+ else
+ {
+ kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &global_size[0], &local_size[0], &workItemOutput[0],
+ true, KERNEL_TYPE_GEMM_LIKE));
+ return true;
+ }
+ }
+ else
+ return false;
+}
+
+template<>
+bool OCL4DNNConvSpatial<float>::setupIDLF(int32_t blockWidth,
+ int32_t blockHeight,
+ int32_t simd_size)
+{
+ int32_t workItemOutput[3] = { blockWidth, blockHeight, simd_size };
+ const int32_t num_output_maps = M_;
+ int32_t output_width = output_w_;
+ int32_t output_height = output_h_;
+ int32_t output_block_width = blockWidth;
+ int32_t output_block_height = blockHeight;
+ int32_t num_batches = num_;
+
+ size_t global_size[3] = {
+ (size_t)divUp(output_width, output_block_width),
+ (size_t)divUp(output_height, output_block_height),
+ (size_t)num_batches * alignSize(num_output_maps, simd_size) };
+ size_t local_size[3] = { 1, 1, static_cast<size_t>(simd_size) };
+
+ kernelType_ = KERNEL_TYPE_INTEL_IDLF;
+ blockM_ = blockWidth;
+ blockK_ = blockHeight;
+ blockN_ = simd_size;
+
+ setupKernel();
+
+ ocl::Program program = compileKernel();
+ if (program.ptr())
+ {
+ size_t workgroupSize_used;
+ ocl::Kernel kernel(kernel_name_.c_str(), program);
+ if (kernel.empty())
+ return false;
+
+ workgroupSize_used = kernel.preferedWorkGroupSizeMultiple();
+ if (workgroupSize_used != simd_size)
+ {
+ std::cerr << "OpenCV(ocl4dnn): The OpenCL compiler chose a simd size (" << workgroupSize_used << ") that " << std::endl;
+ std::cerr << " does not equal the size (" << simd_size << ") kernel source required." << std::endl;
+ std::cerr << " Skip this kernel " << kernel_name_ << std::endl;
+ unloadProgram(kernel_name_);
+ return false;
+ }
+ else
+ {
+ kernelQueue.push_back(makePtr<kernelConfig>(kernel_name_, &global_size[0], &local_size[0], &workItemOutput[0],
+ true, KERNEL_TYPE_INTEL_IDLF));
+ return true;
+ }
+ }
+ else
+ return false;
+}
+
+template<>
+bool OCL4DNNConvSpatial<float>::createConvolutionKernel(int32_t kernelType,
+ int32_t blockWidth,
+ int32_t blockHeight,
+ int32_t blockDepth)
+{
+ kernelType_ = kernelType;
+ options_.str(""); options_.clear(); // clear contents and state flags
+ src_ = ocl::ProgramSource();
+
+ if (kernelType == KERNEL_TYPE_INTEL_IDLF)
+ return setupIDLF(blockWidth, blockHeight, blockDepth);
+ else if (kernelType == KERNEL_TYPE_BASIC)
+ return createBasicKernel(blockWidth, blockHeight, blockDepth);
+ else if (kernelType == KERNEL_TYPE_GEMM_LIKE)
+ return createGEMMLikeConvKernel(blockWidth, blockHeight, blockDepth);
+ else
+ CV_Assert(0 && "Internal error");
+ return false;
+}
+
+template<>
+void OCL4DNNConvSpatial<float>::generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems)
+{
+ if (ocl::Device::getDefault().intelSubgroupsSupport()) {
+ /* IDLF kernels use an Intel-specific extension, which makes
+ them Intel-only. */
+ // Generates static key_
+ int max_compute_units = ocl::Device::getDefault().maxComputeUnits();
+ int kernelCnt = 0;
+ if (group_ == 1 && ((M_ % 8 == 0) && (M_ % 32 != 24))) {
+ tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 1, 8, 32));
+ tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 2, 8, 32));
+
+ if (kernel_w_ < 4 && M_ % 32 == 0)
+ tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_GEMM_LIKE, 1, 16, 32));
+ }
+
+ for (int simd_size = 8; simd_size <= 16; simd_size += 8) {
+ if (simd_size == 8 && !((group_ == 1 || M_ % 8 == 0)))
+ continue;
+ if (simd_size == 16 && !(group_ == 1 || M_ % 16 == 0))
+ continue;
+ const int width_max = 14, height_max = 8, block_size_max = 32;
+ for (uint32_t width = width_max; width > 0; width--) {
+ int candidate = 0;
+ if (width > output_w_)
+ continue;
+ for (uint32_t height = height_max; height > 0; height--) {
+ if (width * height > block_size_max || height > output_h_)
+ continue;
+ // We only tune for simd 8 when the work item count is less than the
+ // device's max work items or M_ is less than 16.
+ if (simd_size == 8 &&
+ M_ >= 16 &&
+ ((num_ * M_ * output_w_ * output_h_ / static_cast<float>(width * height)) >=
+ max_compute_units * 7 * 16))
+ continue;
+ int actual_tile_x = kernel_w_ * dilation_w_ + (width - 1) * stride_w_;
+ int tile_x = alignSize(actual_tile_x, 4);
+ int tile_y = kernel_h_ * dilation_h_ + (height - 1) * stride_h_;
+ if (tile_x > (4 * simd_size))
+ continue;
+ // Skip candidates whose actual_tile_x is not a multiple of 4, as they
+ // may waste some IO bandwidth. This prunes about 75% of the tuning
+ // candidates and has only a slight impact on the final tuning result,
+ // less than 2% for most cases.
+ if (actual_tile_x % 4 != 0)
+ continue;
+ if ((width * height + divUp(tile_x * tile_y, simd_size)) > block_size_max)
+ continue;
+ int tile_y_stride = (4 * simd_size) / tile_x;
+
+ if (divUp(tile_y, tile_y_stride) < 4) {
+ tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_INTEL_IDLF, width, height, simd_size));
+ candidate++;
+ }
+ if (candidate >= 4 && height == 2)
+ break;
+ }
+ kernelCnt += candidate;
+ if (kernelCnt >= 12 && width == 2)
+ break;
+ }
+ }
+ }
+}
+
+template<>
+void OCL4DNNConvSpatial<float>::useFirstAvailable(const UMat &bottom,
+ UMat &top,
+ const UMat &weight,
+ const UMat &bias,
+ int32_t numImages,
+ UMat &verifyTop)
+{
+ std::vector< cv::Ptr<tunerParam> > tunerItems;
+ generateTunerItems(tunerItems);
+ tunerItems.push_back(makePtr<tunerParam>(KERNEL_TYPE_BASIC, 1, 1, 1));
+
+ for (int i = 0; i < tunerItems.size(); i++) {
+ if (createConvolutionKernel(tunerItems[i]->kernelType,
+ tunerItems[i]->blockWidth,
+ tunerItems[i]->blockHeight,
+ tunerItems[i]->blockDepth)) {
+ int kernelIdx = kernelQueue.size() - 1;
+ if (verifyResult(bottom, top, weight, bias, numImages, kernelQueue[kernelIdx], verifyTop)) {
+ bestKernelConfig = kernelQueue[kernelIdx];
+ if (bestKernelConfig->kernelType != KERNEL_TYPE_INTEL_IDLF &&
+ bestKernelConfig->kernelType != KERNEL_TYPE_GEMM_LIKE)
+ if (!swizzled_weights_umat.empty())
+ swizzled_weights_umat.release();
+
+ for (int32_t j = 0; j < kernelIdx; j++) {
+ CV_Assert(phash.find(kernelQueue[j]->kernelName) != phash.end());
+ unloadProgram(kernelQueue[j]->kernelName);
+ }
+ kernelQueue.clear();
+ tuned_ = true;
+ break;
+ }
+ }
+ }
+}
+
+template<>
+void OCL4DNNConvSpatial<float>::cacheTunedConfig()
+{
+ if (tuned_)
+ {
+ cv::AutoLock lock(kernelConfigMutex);
+ std::stringstream outputKernel;
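+ // The cached value is nine space-separated fields: workItem_output[0..2],
+ // kernelType, local_work_size[0..2], swizzle_weights and use_null_local,
+ // e.g. "1 8 32 5 1 8 1 1 0" (hypothetical GEMM-like entry).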
+ outputKernel << bestKernelConfig->workItem_output[0] << " "
+ << bestKernelConfig->workItem_output[1] << " "
+ << bestKernelConfig->workItem_output[2] << " "
+ << bestKernelConfig->kernelType << " "
+ << bestKernelConfig->local_work_size[0] << " "
+ << bestKernelConfig->local_work_size[1] << " "
+ << bestKernelConfig->local_work_size[2] << " "
+ << bestKernelConfig->swizzle_weights << " "
+ << bestKernelConfig->use_null_local << " ";
+ kernelConfigMap.insert(std::pair<std::string, std::string>(key_, outputKernel.str()));
+ }
+}
+
+template<>
+void OCL4DNNConvSpatial<float>::setupConvolution(const UMat &bottom,
+ UMat &top,
+ const UMat &weight,
+ const UMat &bias,
+ int32_t numImages,
+ UMat &verifyTop)
+{
+ std::vector< cv::Ptr<tunerParam> > tunerItems;
+
+ generateTunerItems(tunerItems);
+ for (int i = 0; i < tunerItems.size(); i++)
+ createConvolutionKernel(tunerItems[i]->kernelType,
+ tunerItems[i]->blockWidth,
+ tunerItems[i]->blockHeight,
+ tunerItems[i]->blockDepth);
+
+ for (int32_t x = 0; x < kernelQueue.size(); x++) {
+ kernelQueue[x]->executionTime = timedConvolve(bottom, top, weight, bias, numImages,
+ kernelQueue[x]);
+ #ifdef TEST_ALL_KERNELS
+ if (kernelQueue[x]->tested == false) {
+ bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[x], verifyTop);
+ if (verified == false) {
+ dbgPrint(std::cout << "Kernel "
+ << kernelQueue[x]->kernelName
+ << " failed verification" << std::endl);
+ dbgPrint(std::cout << "kernelQueue[x]->workItem_output[0]: "
+ << kernelQueue[x]->workItem_output[0] << " "
+ << "kernelQueue[x]->workItem_output[1]: "
+ << kernelQueue[x]->workItem_output[1] << " "
+ << "kernelQueue[x]->workItem_output[2]: "
+ << kernelQueue[x]->workItem_output[2] << " "
+ << "kernelQueue[x]->kernelType: "
+ << kernelQueue[x]->kernelType << " "
+ << "kernelQueue[x]->global_work_size[0]: "
+ << kernelQueue[x]->global_work_size[0] << " "
+ << "kernelQueue[x]->global_work_size[1]: "
+ << kernelQueue[x]->global_work_size[1] << " "
+ << "kernelQueue[x]->global_work_size[2]: "
+ << kernelQueue[x]->global_work_size[2] << " "
+ << "kernelQueue[x]->local_work_size[0]: "
+ << kernelQueue[x]->local_work_size[0] << " "
+ << "kernelQueue[x]->local_work_size[1]: "
+ << kernelQueue[x]->local_work_size[1] << " "
+ << "kernelQueue[x]->local_work_size[2]: "
+ << kernelQueue[x]->local_work_size[2] << " "
+ << kernelQueue[x]->swizzle_weights << " "
+ << kernelQueue[x]->use_null_local << std::endl);
+ } else {
+ dbgPrint(std::cout << "Kernel "
+ << kernelQueue[x]->kernelName
+ << " pass verification" << std::endl);
+ }
+ }
+ #endif
+ }
+ int32_t failures = 0;
+ bool verification = false;
+ if (kernelQueue.size()) {
+ while (failures < kernelQueue.size()) {
+ int32_t fastestKernel = -1;
+ float fastestTime = std::numeric_limits<float>::infinity();
+
+ for (int32_t x = 0; x < kernelQueue.size(); x++) {
+ if (kernelQueue[x]->executionTime < fastestTime &&
+ kernelQueue[x]->tested == false) {
+ fastestKernel = x;
+ fastestTime = kernelQueue[x]->executionTime;
+ }
+ }
+ if (fastestKernel < 0) break;
+ // Test fastest kernel
+ bool verified = verifyResult(bottom, top, weight, bias, numImages, kernelQueue[fastestKernel], verifyTop);
+ if (verified == true) {
+ kernelQueue[fastestKernel]->verified = true;
+ kernel_index_ = fastestKernel;
+ verification = true;
+ break;
+ } else {
+ kernelQueue[fastestKernel]->tested = true;
+ dbgPrint(std::cout << "Kernel " <<
+ kernelQueue[fastestKernel]->kernelName <<
+ " failed verification" << std::endl);
+ failures++;
+ }
+ }
+ }
+ if (verification) {
+ dbgPrint(std::cout << "Kernel <" << kernelQueue[kernel_index_]->kernelName <<
+ "> passed verification" << std::endl);
+ dbgPrint(std::cout << "Convolution Time:" << kernelQueue[kernel_index_]->executionTime << std::endl);
+ } else {
+        dbgPrint(std::cout << "falling back to basic kernel" << std::endl);
+ options_.str(""); options_.clear(); // clear contents and state flags
+ createBasicKernel(1, 1, 1);
+ kernel_index_ = kernelQueue.size() - 1;
+ }
+ this->bestKernelConfig = kernelQueue[kernel_index_];
+
+ if (bestKernelConfig->kernelType != KERNEL_TYPE_INTEL_IDLF && bestKernelConfig->kernelType != KERNEL_TYPE_GEMM_LIKE)
+ if (!swizzled_weights_umat.empty())
+ swizzled_weights_umat.release();
+
+ for (int32_t x = 0; x < kernelQueue.size(); x++) {
+ if (x != kernel_index_) {
+ CV_Assert(phash.find(kernelQueue[x]->kernelName) != phash.end());
+ unloadProgram(kernelQueue[x]->kernelName);
+ }
+ }
+ kernelQueue.clear();
+ tuned_ = true;
+ saveTunedConfig();
+}
+
+template<typename Dtype>
+void OCL4DNNConvSpatial<Dtype>::saveTunedConfig()
+{
+ CV_Assert(tuned_);
+ if (!use_cache_path_ || cache_path_.empty())
+ return;
+
+ std::string outputFile;
+ outputFile = cache_path_ + "/" + key_sanitized_;
+ std::ofstream outputKernel;
+ outputKernel.open(outputFile.c_str());
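+    // Same nine-field format parsed by loadCachedConfig() and loadTunedConfig().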
+ outputKernel << bestKernelConfig->workItem_output[0] << " "
+ << bestKernelConfig->workItem_output[1] << " "
+ << bestKernelConfig->workItem_output[2] << " "
+ << bestKernelConfig->kernelType << " "
+ << bestKernelConfig->local_work_size[0] << " "
+ << bestKernelConfig->local_work_size[1] << " "
+ << bestKernelConfig->local_work_size[2] << " "
+ << bestKernelConfig->swizzle_weights << " "
+ << bestKernelConfig->use_null_local << " ";
+ outputKernel.close();
+}
+
+template<typename Dtype>
+void OCL4DNNConvSpatial<Dtype>::prepareKernel(const UMat &bottom, UMat &top,
+ const UMat &weight, const UMat &bias,
+ int32_t numImages)
+{
+ std::string previous_key = key_;
+
+ generateKey();
+ if (key_.compare(previous_key) == 0 && bestKernelConfig != NULL)
+ return;
+
+ if (bestKernelConfig)
+ {
+ prev_kernel_type_ = bestKernelConfig->kernelType;
+ CV_Assert(phash.find(bestKernelConfig->kernelName) != phash.end());
+ phash.erase(bestKernelConfig->kernelName);
+ bestKernelConfig.release();
+ }
+
+ if (loadCachedConfig()) // check in-memory cache
+ return;
+ if (loadTunedConfig()) // check external storage
+ return;
+
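+    // benchData receives a reference result used to verify the candidate kernels.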
+ UMat benchData(1, numImages * top_dim_, CV_32FC1);
+ if (force_auto_tuning_)
+ {
+ calculateBenchmark(bottom, benchData, weight, bias, numImages);
+ setupConvolution(bottom, top, weight, bias, numImages, benchData);
+ }
+ else
+ {
+ calculateBenchmark(bottom, benchData, weight, bias, numImages);
+ useFirstAvailable(bottom, top, weight, bias, numImages, benchData);
+ }
+ cacheTunedConfig();
+}
+
+template<typename Dtype>
+bool OCL4DNNConvSpatial<Dtype>::loadCachedConfig()
+{
+ cv::AutoLock lock(kernelConfigMutex);
+ if (!defaultConfigLoaded)
+ {
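+        // default_kernel_config_intel is a flat array of alternating key and
+        // value strings, hence the division by two and the 2*i indexing.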
+ const size_t numConfigs = sizeof(default_kernel_config_intel)/sizeof(default_kernel_config_intel[0])/2;
+ for (size_t i = 0; i < numConfigs; i++)
+ {
+ std::pair<std::string, std::string> entry(
+ std::string("Intel(R) Corporation_") + default_kernel_config_intel[2 * i],
+ default_kernel_config_intel[2 * i + 1]);
+ kernelConfigMap.insert(entry);
+ }
+ defaultConfigLoaded = true;
+ }
+
+ kernel_hash_t::iterator it = kernelConfigMap.find(key_);
+ if (it != kernelConfigMap.end())
+ {
+ int32_t x, y, z, type, lx, ly, lz;
+ bool swizzle, nullLocal;
+ std::stringstream cachedKernel(it->second);
+ if (cachedKernel)
+ {
+ cachedKernel >> x;
+ cachedKernel >> y;
+ cachedKernel >> z;
+ cachedKernel >> type;
+ cachedKernel >> lx;
+ cachedKernel >> ly;
+ cachedKernel >> lz;
+ cachedKernel >> swizzle;
+ cachedKernel >> nullLocal;
+ if (setupKernelByConfig(x, y, z, type, lx, ly, lz, swizzle, nullLocal)) {
+ tuned_ = true;
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+
+template<typename Dtype>
+bool OCL4DNNConvSpatial<Dtype>::setupKernelByConfig(int x, int y, int z, int type,
+ int lx, int ly, int lz,
+ bool swizzle, bool nullLocal)
+{
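+    // For IDLF kernels the z parameter is the SIMD size; a stored value of 1
+    // selects the default SIMD16.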
+ if (type == KERNEL_TYPE_INTEL_IDLF)
+ {
+ if (z == 1)
+ z = 16;
+ CHECK_EQ(z == 16 || z == 8, true) << "invalid SIMD size" << std::endl;
+ }
+ kernelQueue.clear();
+ createConvolutionKernel(type, x, y, z);
+ if (kernelQueue.size() != 1) {
+        std::cerr << "Failed to set up kernel from config:"
+ << " x = " << x
+ << " y = " << y
+ << " z = " << z
+ << " type = " << type
+ << std::endl;
+ return false;
+ }
+ bestKernelConfig = kernelQueue[0];
+ kernelQueue.clear();
+ bestKernelConfig->local_work_size[0] = lx;
+ bestKernelConfig->local_work_size[1] = ly;
+ bestKernelConfig->local_work_size[2] = lz;
+ bestKernelConfig->swizzle_weights = swizzle;
+ bestKernelConfig->use_null_local = nullLocal;
+    // If the kernel type changed to IDLF or GEMM-like, reset the swizzled
+    // weights buffer to invalidate any previously swizzled weight data.
+ if (prev_kernel_type_ != bestKernelConfig->kernelType &&
+ (bestKernelConfig->kernelType == KERNEL_TYPE_INTEL_IDLF ||
+ bestKernelConfig->kernelType == KERNEL_TYPE_GEMM_LIKE))
+ {
+ if (!swizzled_weights_umat.empty())
+ swizzled_weights_umat.release();
+ }
+ return true;
+}
+
+template<typename Dtype>
+bool OCL4DNNConvSpatial<Dtype>::loadTunedConfig()
+{
+ if (!use_cache_path_)
+ {
+ if (cache_path_.empty() && !force_auto_tuning_)
+ {
+ static int warn_ = 0;
+ if (!warn_)
+ {
+                std::cout << "OpenCV(ocl4dnn): consider specifying a kernel configuration cache directory" << std::endl
+                          << "                 via the OPENCV_OCL4DNN_CONFIG_PATH parameter." << std::endl;
+ warn_ = true;
+ }
+ }
+ return false;
+ }
+
+ int32_t x, y, z, type, lx, ly, lz;
+ bool swizzle, nullLocal;
+
+    // Load a cached kernel configuration from file
+ std::string cacheFile = cache_path_ + "/" + key_sanitized_;
+ std::ifstream cachedKernel(cacheFile.c_str());
+ if (cachedKernel)
+ {
+ cachedKernel >> x;
+ cachedKernel >> y;
+ cachedKernel >> z;
+ cachedKernel >> type;
+ cachedKernel >> lx;
+ cachedKernel >> ly;
+ cachedKernel >> lz;
+ cachedKernel >> swizzle;
+ cachedKernel >> nullLocal;
+ if (setupKernelByConfig(x, y, z, type, lx, ly, lz, swizzle, nullLocal)) {
+ tuned_ = true;
+ return true;
+ }
+ }
+ return false;
+}
+
+template class OCL4DNNConvSpatial<float>;
+} // namespace ocl4dnn
+}
+}
+#endif // HAVE_OPENCL
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "../../precomp.hpp"
+#include "common.hpp"
+#include "ocl4dnn.hpp"
+#include "math_functions.hpp"
+
+#ifdef HAVE_OPENCL
+namespace cv { namespace dnn { namespace ocl4dnn {
+template<typename Dtype>
+OCL4DNNInnerProduct<Dtype>::OCL4DNNInnerProduct(OCL4DNNInnerProductConfig config)
+{
+ bias_term_ = config.bias_term;
+ transpose_ = config.transpose;
+ N_ = num_output_ = config.num_output;
+ M_ = config.M;
+ K_ = config.K;
+ phase_test_ = config.phase_test;
+ image_copied_ = false;
+}
+
+template<typename Dtype>
+OCL4DNNInnerProduct<Dtype>::~OCL4DNNInnerProduct()
+{
+}
+
+template<typename Dtype>
+bool OCL4DNNInnerProduct<Dtype>::Forward(const UMat& bottom,
+ const UMat& weight,
+ const UMat& bias,
+ UMat& top)
+{
+ bool ret;
+
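+    // M_ == 1 (a single input row) reduces to a matrix-vector product plus an
+    // optional bias update; larger M_ uses the image-based GEMM path on
+    // devices that support Intel subgroups.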
+ if (M_ == 1)
+ {
+ ret = ocl4dnnGEMV<Dtype>(CblasNoTrans, N_, K_, (Dtype) 1.,
+ weight, 0, bottom, 0, (Dtype) 0., top, 0);
+
+ if (bias_term_ && ret)
+ ret = ocl4dnnAXPY<Dtype>(N_, 1, bias, 0, top, 0);
+
+ return ret;
+ }
+ else
+ {
+ ret = false;
+ size_t max_image_size = std::min(ocl::Device::getDefault().image2DMaxWidth(),
+ ocl::Device::getDefault().image2DMaxHeight());
+ if (M_ <= max_image_size &&
+ N_ <= max_image_size &&
+ K_ <= max_image_size &&
+ cv::traits::Depth<Dtype>::value == CV_32F &&
+ ocl::Device::getDefault().intelSubgroupsSupport())
+ {
+ ret = ocl4dnnGEMMCommon<Dtype>(transpose_ ? CblasNoTrans : CblasTrans,
+ M_, N_, K_, bottom, weight, UMat(), top,
+ max_image_size);
+ }
+ return ret;
+ }
+}
+
+template class OCL4DNNInnerProduct<float>;
+} // namespace ocl4dnn
+}
+}
+#endif // HAVE_OPENCL
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "../../precomp.hpp"
+#include "common.hpp"
+#include "ocl4dnn.hpp"
+#include "opencl_kernels_dnn.hpp"
+
+#ifdef HAVE_OPENCL
+namespace cv { namespace dnn { namespace ocl4dnn {
+template<typename Dtype>
+OCL4DNNLRN<Dtype>::OCL4DNNLRN(OCL4DNNLRNConfig config)
+{
+ lrn_type_ = config.lrn_type;
+ phase_test_ = config.phase_test;
+ size_ = config.local_size;
+    CHECK_EQ(size_ % 2, 1) << "LRN only supports odd values for local_size";
+ alpha_ = config.alpha;
+ beta_ = config.beta;
+ k_ = config.k;
+ norm_by_size_ = config.norm_by_size;
+ num_ = config.batch_size;
+ channels_ = config.channels;
+ height_ = config.height;
+ width_ = config.width;
+}
+
+template<typename Dtype>
+bool OCL4DNNLRN<Dtype>::Forward(const UMat& bottom, UMat& top)
+{
+ bool ret = true;
+
+ if (!ocl::Device::getDefault().intelSubgroupsSupport())
+ return false;
+
+ switch (lrn_type_)
+ {
+ case LRNParameter_NormRegion_ACROSS_CHANNELS:
+ ret = crossChannelForward(bottom, top);
+ break;
+ case LRNParameter_NormRegion_WITHIN_CHANNEL:
+ //TODO
+ //WithinChannelForward(bottom_data, top_data);
+ ret = false;
+ break;
+ default:
+ ret = false;
+        LOG(FATAL) << "Unknown normalization region.";
+ }
+ return ret;
+}
+
+template<typename Dtype>
+bool OCL4DNNLRN<Dtype>::crossChannelForward(const UMat& bottom, UMat& top)
+{
+ ocl::Queue queue = ocl::Queue::getDefault();
+    CHECK_EQ(phase_test_, true) << "Only forward inference is supported.";
+
+ cl_uint argIdx = 0;
+ int32_t n_threads = num_ * height_ * width_;
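+    // One work-item per (image, y, x) position; the across-channels window is
+    // handled inside the kernel.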
+ size_t global_work_size_[1] = {(size_t)n_threads};
+ String opts = clOptionSupport("-cl-no-subgroup-ifp") ? " -cl-no-subgroup-ifp " : "";
+ ocl::Kernel oclk_lrn_fill;
+ if (!oclk_lrn_fill.create(CL_KERNEL_SELECT("lrn_full_no_scale"), ocl::dnn::ocl4dnn_lrn_oclsrc, opts))
+ return false;
+
+ oclk_lrn_fill.set(argIdx++, n_threads);
+ oclk_lrn_fill.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+ oclk_lrn_fill.set(argIdx++, num_);
+ oclk_lrn_fill.set(argIdx++, channels_);
+ oclk_lrn_fill.set(argIdx++, height_);
+ oclk_lrn_fill.set(argIdx++, width_);
+ oclk_lrn_fill.set(argIdx++, size_);
+ int size_norm_factor = norm_by_size_ ? size_ : 1;
+ oclk_lrn_fill.set(argIdx++, alpha_ / size_norm_factor);
+ oclk_lrn_fill.set(argIdx++, k_);
+ oclk_lrn_fill.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+ oclk_lrn_fill.set(argIdx++, -beta_);
+
+ return oclk_lrn_fill.run(1, global_work_size_, NULL, false);
+}
+
+template class OCL4DNNLRN<float>;
+} // namespace ocl4dnn
+}
+}
+#endif // HAVE_OPENCL
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "../../precomp.hpp"
+#include <string>
+#include <vector>
+#include "common.hpp"
+#include "ocl4dnn.hpp"
+#include "opencl_kernels_dnn.hpp"
+
+#ifdef HAVE_OPENCL
+namespace cv { namespace dnn { namespace ocl4dnn {
+template<typename Dtype>
+OCL4DNNPool<Dtype>::OCL4DNNPool(OCL4DNNPoolConfig config)
+{
+ int dims = config.in_shape.size();
+ int spatial_dims = 2;
+
+ batch_size_ = config.in_shape[0];
+ channels_ = config.channels;
+ pool_method_ = config.pool_method;
+
+ for (int i = 0; i < spatial_dims; ++i)
+ {
+ kernel_shape_.push_back(i == 0 ? config.kernel.height : config.kernel.width);
+ pad_.push_back(i == 0 ? config.pad.height : config.pad.width);
+ stride_.push_back(i == 0 ? config.stride.height : config.stride.width);
+ im_in_shape_.push_back(config.in_shape[dims - spatial_dims + i]);
+ im_out_shape_.push_back(config.out_shape[dims - spatial_dims + i]);
+ }
+
+ kernel_h_ = kernel_shape_[0];
+ kernel_w_ = kernel_shape_[1];
+ stride_h_ = stride_[0];
+ stride_w_ = stride_[1];
+ pad_h_ = pad_[0];
+ pad_w_ = pad_[1];
+ height_ = im_in_shape_[0];
+ width_ = im_in_shape_[1];
+ pooled_height_ = im_out_shape_[0];
+ pooled_width_ = im_out_shape_[1];
+
+ count_ = 1;
+ for (int i = 0; i < config.out_shape.size(); ++i)
+ {
+ count_ *= config.out_shape[i];
+ }
+}
+
+template<typename Dtype>
+OCL4DNNPool<Dtype>::~OCL4DNNPool()
+{
+ mask_idx_.release();
+}
+
+template<typename Dtype>
+bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
+ UMat& top,
+ UMat& top_mask)
+{
+ bool ret = true;
+ ocl::Queue queue = ocl::Queue::getDefault();
+ size_t global[] = { 128 * 128 };
+ size_t local[] = { 128 };
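+    // Fixed launch configuration; the kernels take count_ and stride over all
+    // output elements rather than mapping one work-item per output.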
+ cl_uint argIdx = 0;
+
+ // support 2D case
+ switch (pool_method_)
+ {
+ case LIBDNN_POOLING_METHOD_MAX:
+ {
+ if (top_mask.empty() && mask_idx_.empty())
+ {
+ mask_idx_.create(1, count_, CV_32FC1);
+ }
+ ocl::Kernel oclk_max_pool_forward(CL_KERNEL_SELECT("max_pool_forward"),
+ cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+
+ if (oclk_max_pool_forward.empty())
+ return false;
+
+ argIdx = 0;
+ oclk_max_pool_forward.set(argIdx++, count_);
+ oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+ oclk_max_pool_forward.set(argIdx++, batch_size_);
+ oclk_max_pool_forward.set(argIdx++, channels_);
+ oclk_max_pool_forward.set(argIdx++, height_);
+ oclk_max_pool_forward.set(argIdx++, width_);
+ oclk_max_pool_forward.set(argIdx++, pooled_height_);
+ oclk_max_pool_forward.set(argIdx++, pooled_width_);
+ oclk_max_pool_forward.set(argIdx++, kernel_h_);
+ oclk_max_pool_forward.set(argIdx++, kernel_w_);
+ oclk_max_pool_forward.set(argIdx++, stride_h_);
+ oclk_max_pool_forward.set(argIdx++, stride_w_);
+ oclk_max_pool_forward.set(argIdx++, pad_h_);
+ oclk_max_pool_forward.set(argIdx++, pad_w_);
+ oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+ oclk_max_pool_forward.set(argIdx++, mask_idx_.empty() ? 0 : 1);
+ if (mask_idx_.empty())
+ oclk_max_pool_forward.set(argIdx++, (void *)NULL);
+ else
+ oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(mask_idx_));
+ oclk_max_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top_mask));
+
+ ret = oclk_max_pool_forward.run(1, global, local, false);
+ }
+ break;
+ case LIBDNN_POOLING_METHOD_AVE:
+ {
+ ocl::Kernel oclk_ave_pool_forward(CL_KERNEL_SELECT("ave_pool_forward"),
+ cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+
+ if (oclk_ave_pool_forward.empty())
+ return false;
+
+ argIdx = 0;
+ oclk_ave_pool_forward.set(argIdx++, count_);
+ oclk_ave_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+ oclk_ave_pool_forward.set(argIdx++, batch_size_);
+ oclk_ave_pool_forward.set(argIdx++, channels_);
+ oclk_ave_pool_forward.set(argIdx++, height_);
+ oclk_ave_pool_forward.set(argIdx++, width_);
+ oclk_ave_pool_forward.set(argIdx++, pooled_height_);
+ oclk_ave_pool_forward.set(argIdx++, pooled_width_);
+ oclk_ave_pool_forward.set(argIdx++, kernel_h_);
+ oclk_ave_pool_forward.set(argIdx++, kernel_w_);
+ oclk_ave_pool_forward.set(argIdx++, stride_h_);
+ oclk_ave_pool_forward.set(argIdx++, stride_w_);
+ oclk_ave_pool_forward.set(argIdx++, pad_h_);
+ oclk_ave_pool_forward.set(argIdx++, pad_w_);
+ oclk_ave_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+
+ ret = oclk_ave_pool_forward.run(1, global, local, false);
+ }
+ break;
+ case LIBDNN_POOLING_METHOD_STO:
+ {
+ ocl::Kernel oclk_sto_pool_forward(CL_KERNEL_SELECT("sto_pool_forward_test"),
+ cv::ocl::dnn::ocl4dnn_pooling_oclsrc);
+
+ if (oclk_sto_pool_forward.empty())
+ return false;
+
+ argIdx = 0;
+ oclk_sto_pool_forward.set(argIdx++, count_);
+ oclk_sto_pool_forward.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+ oclk_sto_pool_forward.set(argIdx++, batch_size_);
+ oclk_sto_pool_forward.set(argIdx++, channels_);
+ oclk_sto_pool_forward.set(argIdx++, height_);
+ oclk_sto_pool_forward.set(argIdx++, width_);
+ oclk_sto_pool_forward.set(argIdx++, pooled_height_);
+ oclk_sto_pool_forward.set(argIdx++, pooled_width_);
+ oclk_sto_pool_forward.set(argIdx++, kernel_h_);
+ oclk_sto_pool_forward.set(argIdx++, kernel_w_);
+ oclk_sto_pool_forward.set(argIdx++, stride_h_);
+ oclk_sto_pool_forward.set(argIdx++, stride_w_);
+ oclk_sto_pool_forward.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+
+ ret = oclk_sto_pool_forward.run(1, global, local, false);
+ }
+ break;
+ default:
+ {
+ ret = false;
+            LOG(FATAL) << "Unknown pooling method.";
+ }
+ }
+ return ret;
+}
+
+template class OCL4DNNPool<float>;
+} // namespace ocl4dnn
+}
+}
+#endif // HAVE_OPENCL
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "../../precomp.hpp"
+#include <vector>
+#include "common.hpp"
+#include "ocl4dnn.hpp"
+#include "opencl_kernels_dnn.hpp"
+
+#ifdef HAVE_OPENCL
+namespace cv { namespace dnn { namespace ocl4dnn {
+template<typename Dtype>
+OCL4DNNSoftmax<Dtype>::OCL4DNNSoftmax(OCL4DNNSoftmaxConfig config)
+{
+ softmax_axis_ = config.axis;
+ channels_ = config.channels;
+
+ inner_num_ = 1;
+ outer_num_ = 1;
+ count_ = 1;
+ int32_t scale_sz = 1;
+ for (int32_t i = softmax_axis_ + 1; i < config.in_shape.size(); i++)
+ inner_num_ *= config.in_shape[i];
+ use_slm_ = (config.in_shape[softmax_axis_] * inner_num_ + inner_num_ * 17) <= 8192;
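+    // The SLM variant needs in_shape[axis]*inner + 17*inner elements of local
+    // memory (the three local-buffer kernel args set in Forward); 8192
+    // elements is the budget assumed here.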
+ for (int32_t i = 0; i < softmax_axis_; i++)
+ outer_num_ *= config.in_shape[i];
+ count_ = inner_num_ + outer_num_;
+
+ std::vector<int32_t> scale_dims = config.in_shape;
+ scale_dims[softmax_axis_] = use_slm_ ? 1 : 17;
+ for (int32_t i = 0; i < scale_dims.size(); i++)
+ scale_sz *= scale_dims[i];
+
+ scale_data_.create(1, scale_sz, CV_32FC1);
+}
+
+template<typename Dtype>
+OCL4DNNSoftmax<Dtype>::~OCL4DNNSoftmax()
+{
+ scale_data_.release();
+}
+
+template<typename Dtype>
+bool OCL4DNNSoftmax<Dtype>::Forward(const UMat& bottom, UMat& top)
+{
+ bool ret = false;
+ ocl::Queue queue = ocl::Queue::getDefault();
+ bool intel_subgroup = ocl::Device::getDefault().intelSubgroupsSupport();
+ if (intel_subgroup && inner_num_ < 128)
+ {
+ String opts = clOptionSupport("-cl-no-subgroup-ifp") ? " -cl-no-subgroup-ifp " : "";
+ String kname;
+ ocl::Kernel oclk_softmax_forward_kernel;
+
+ if (use_slm_)
+ kname = CL_KERNEL_SELECT("softmax_forward_slm");
+ else
+ kname = CL_KERNEL_SELECT("softmax_forward");
+
+ if (!oclk_softmax_forward_kernel.create(kname.c_str(), ocl::dnn::softmax_loss_oclsrc, opts))
+ return false;
+
+ size_t global_size[] = { 256, (size_t)outer_num_, 1 };
+ size_t local_size[] = { 256, 1, 1 };
+ cl_uint argIdx = 0;
+
+ if (use_slm_)
+ {
+ oclk_softmax_forward_kernel.set(argIdx++, outer_num_);
+ oclk_softmax_forward_kernel.set(argIdx++, channels_);
+ oclk_softmax_forward_kernel.set(argIdx++, inner_num_);
+ oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(scale_data_));
+ oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+ oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+ oclk_softmax_forward_kernel.set(argIdx++, NULL, channels_ * inner_num_* sizeof(Dtype));
+ oclk_softmax_forward_kernel.set(argIdx++, NULL, inner_num_* sizeof(Dtype));
+ oclk_softmax_forward_kernel.set(argIdx++, NULL, 16 * inner_num_* sizeof(Dtype));
+ }
+ else
+ {
+ oclk_softmax_forward_kernel.set(argIdx++, outer_num_);
+ oclk_softmax_forward_kernel.set(argIdx++, channels_);
+ oclk_softmax_forward_kernel.set(argIdx++, inner_num_);
+ oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(scale_data_));
+ oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrReadOnly(bottom));
+ oclk_softmax_forward_kernel.set(argIdx++, ocl::KernelArg::PtrWriteOnly(top));
+ }
+ ret = oclk_softmax_forward_kernel.run(3, global_size, local_size, false);
+ }
+ return ret;
+}
+
+template class OCL4DNNSoftmax<float>;
+} // namespace ocl4dnn
+}
+}
+#endif // HAVE_OPENCL
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
__kernel void ReLUForward(const int count, __global const T* in, __global T* out
#ifndef RELU_NO_SLOPE
, T negative_slope
--- /dev/null
+
+__kernel void batchnorm(__global const T *src, int src_offset,
+ __global const float *meanMat,
+ float varMeanScale,
+ __global const float *invStdMat,
+ __global const float *weight,
+ __global const float *bias,
+ int hasWeight, int hasBias,
+ int width, int height, int channel,
+ __global T *dst, int dst_offset)
+{
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ int c = get_global_id(2);
+
+ if (x >= width || y >= height || c >= channel)
+ return;
+
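+    // Per-channel affine normalization: dst = (src - mean) * weight * invstd + bias.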
+ float mean = meanMat[c] * varMeanScale;
+ float invstd = invStdMat[c];
+ float w = hasWeight ? weight[c] : 1;
+ float b = hasBias ? bias[c] : 0;
+ int index = y * width + x + c * width * height;
+ T val = (src[index + src_offset] - mean) * w * invstd + b;
+ dst[index + dst_offset] = val;
+}
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+__kernel void null_kernel_float(float arg) {
+ float out = arg;
+}
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+__kernel void concat(const int nthreads,
+ __global const Dtype* in_data,
+ const int num_concats,
+ const int concat_size,
+ const int top_concat_axis,
+ const int bottom_concat_axis,
+ const int offset_concat_axis,
+ __global Dtype* out_data) {
+
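+  // Grid-stride loop: split each flat input index into (concat_num,
+  // concat_index) along the concat axis, then shift by offset_concat_axis to
+  // find its slot in the output.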
+ for (int index = get_global_id(0); index < nthreads;
+ index += get_global_size(0)) {
+ const int total_concat_size = concat_size * bottom_concat_axis;
+ const int concat_num = index / total_concat_size;
+ const int concat_index = index % total_concat_size;
+ const int top_index = concat_index
+ + (concat_num * top_concat_axis + offset_concat_axis) * concat_size;
+ out_data[top_index] = in_data[index];
+ }
+}
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if APPLY_BIAS
+#define BIAS_KERNEL_ARG __global Dtype * biases_base,
+#else
+#define BIAS_KERNEL_ARG
+#endif
+
+#define ACTIVATION_FUNCTION(_dst_, _offset_, _data_) do { (_dst_)[(_offset_)] = (_data_);} while(0)
+
+#define __CAT(x, y) x##y
+#define CAT(x, y) __CAT(x, y)
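+// LOOP(N, VAR, STMT) concatenates to LOOP##N, which expands STMT N times and
+// increments VAR after each copy: loop unrolling done by the preprocessor.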
+#define LOOP0(VAR, STMT)
+#define LOOP1(VAR, STMT) (STMT); (VAR)++;
+#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++;
+#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++;
+#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++;
+#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++;
+#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++;
+#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++;
+#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++;
+#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++;
+#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++;
+#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++;
+#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++;
+#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++;
+#define LOOP14(VAR, STMT) LOOP13(VAR, STMT); (STMT); (VAR)++;
+#define LOOP15(VAR, STMT) LOOP14(VAR, STMT); (STMT); (VAR)++;
+#define LOOP16(VAR, STMT) LOOP15(VAR, STMT); (STMT); (VAR)++;
+#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))
+
+#if defined(convolve_simd) || defined(Conv_Interleaved)
+#if Dtype_SIZE == 4
+#define INT_TYPE uint
+#define INT_TYPE2 uint2
+#define INT_TYPE4 uint4
+#define INT_TYPE8 uint8
+#define SUB_GROUP_BLOCK_READ2 intel_sub_group_block_read2
+#define SUB_GROUP_BLOCK_READ4 intel_sub_group_block_read4
+#define SUB_GROUP_BLOCK_READ8 intel_sub_group_block_read8
+#define SUB_GROUP_BLOCK_READ intel_sub_group_block_read
+#else
+#error "Unsupported type"
+#endif
+#endif
+
+#ifdef KERNEL_BASIC
+
+__kernel void ConvolveBasic(
+ __global Dtype* image_data,
+ int image_offset,
+ __global Dtype* kernel_data,
+ int kernel_offset,
+ __global Dtype* bias,
+ const int bias_offset,
+ __global Dtype* convolved_image,
+ const int convolved_image_offset,
+ const ushort input_width,
+ const ushort input_height,
+ const ushort output_width,
+ const ushort output_height,
+ const ushort pad_w,
+ const ushort pad_h
+)
+{
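+    // Each work-item computes one output position for ZPAR consecutive output
+    // feature maps (kernelNum .. kernelNum + ZPAR - 1).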
+ const int outputX = get_global_id(0);
+ const int outputY = get_global_id(1);
+ const int kernelNum = get_global_id(2) * ZPAR;
+ if (outputX < output_width && outputY < output_height)
+ {
+ Dtype sum[ZPAR];
+ for (int kern = 0; kern < ZPAR; kern++)
+ {
+ sum[kern] = 0.0f;
+ }
+ const int org_y = outputY * STRIDE_Y - pad_h;
+ const int org_x = outputX * STRIDE_X - pad_w;
+ const int currentKernelOffset = kernel_offset + kernelNum*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS;
+#if APPLY_BIAS
+ const int biasIndex = bias_offset + kernelNum;
+#endif
+ const int local_image_offset = org_y * input_width + org_x;
+ const int imageSize = input_width * input_height;
+ __global Dtype* image_dataPtr = (image_data + (image_offset + local_image_offset));
+ __global Dtype* kernel_dataPtr = (kernel_data + (currentKernelOffset));
+ for (int c = 0; c < CHANNELS; c++)
+ {
+ for (int y = 0; y < KERNEL_HEIGHT; y++)
+ {
+ for (int x = 0; x < KERNEL_WIDTH; x++)
+ {
+ int y_ = org_y + y * DILATION_Y;
+ int x_ = org_x + x * DILATION_X;
+ if (!(y_ >= 0 && y_ < input_height && x_ >= 0 && x_ < input_width))
+ {
+ continue;
+ }
+ for (int kern = 0; kern < ZPAR; kern++)
+ {
+ sum[kern] += image_dataPtr[x * DILATION_X] * kernel_dataPtr[kern*KERNEL_HEIGHT*KERNEL_WIDTH*CHANNELS + x];
+ }
+ }
+ image_dataPtr += input_width * DILATION_Y;
+ kernel_dataPtr += KERNEL_WIDTH;
+ }
+ image_dataPtr += imageSize - input_width*KERNEL_HEIGHT*DILATION_Y;
+ }
+
+ for (int kern = 0; kern < ZPAR; kern++)
+ {
+ if (kernelNum + kern < OUTPUT_Z)
+ {
+ int offset = convolved_image_offset + (kernelNum+kern)*output_height*output_width + outputY*output_width + outputX;
+#if APPLY_BIAS
+ ACTIVATION_FUNCTION(convolved_image, offset, sum[kern] + bias[biasIndex + kern]);
+#else
+ ACTIVATION_FUNCTION(convolved_image, offset, sum[kern]);
+#endif
+ }
+ }
+ }
+}
+
+#elif defined KERNEL_IDLF
+
+#if TYPE == TYPE_HALF
+#define VLOAD4(_v, _p) do { (_v).s0 = *(_p); (_v).s1 = *(_p + 1); (_v).s2 = *(_p + 2); (_v).s3 = *(_p + 3); } while(0)
+#else
+#define VLOAD4(_v, _p) do { _v = vload4(0, _p); } while(0)
+#endif
+
+// Each work-item computes an OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT region of one output map.
+// Each work-group (which will be mapped to 1 SIMD16/SIMD8 EU thread) will compute 16/8 different feature maps, but each feature map is for the same region of the input image.
+// NDRange: (output_width+pad)/ OUT_BLOCK_WIDTH, (output_height+pad)/OUT_BLOCK_HEIGHT, NUM_FILTERS/OUT_BLOCK_DEPTH
+
+// NOTE: for beignet this reqd_work_group_size does not guarantee that SIMD16 mode will be used, the compiler could choose to use two SIMD8 threads, and if that happens the code will break.
+#ifndef __BEIGNET__
+__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
+__attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
+#endif
+__kernel void
+convolve_simd(
+ __global Dtype* inputs_base,
+ filter_qualifier Dtype* weights_base,
+ BIAS_KERNEL_ARG
+ __global Dtype* outputs_base,
+ const ushort input_width,
+ const ushort input_height,
+ const ushort output_width,
+ const ushort output_height)
+{
+ __global Dtype* outputs = outputs_base;
+ __global Dtype* inputs = inputs_base;
+ filter_qualifier Dtype* weights = weights_base;
+ unsigned int oc = get_global_id(0) * OUT_BLOCK_WIDTH; // oc = Output Column
+ unsigned int or = get_global_id(1) * OUT_BLOCK_HEIGHT;// or = Output Row
+ unsigned int fm = get_global_id(2);// fm = Feature Map = od = Output Depth
+ unsigned int fmg = get_group_id(2);
+ unsigned int lid = get_local_id(2);
+
+ Dtype out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT];
+
+ int in_addr;
+
+    // find the weight address of the given neuron (lid is the index)
+ unsigned int weight_addr = (fmg % (ALIGNED_NUM_FILTERS/SIMD_SIZE)) * INPUT_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE + lid;
+
+ for(int i=0;i<OUT_BLOCK_SIZE;i++) {
+ out[i]=0.0f;
+ }
+
+ unsigned int num_in_batch = ( fm ) / ALIGNED_NUM_FILTERS;
+
+ unsigned int input_batch_offset = num_in_batch * input_height * input_width * TOTAL_INPUT_DEPTH_SIZE;
+
+ int curr_local_y = ( lid / ( TILE_X / 4 ) );
+ int curr_local_x = ( lid % ( TILE_X / 4 ) ) * 4;
+ int curr_y = or * STRIDE_Y + INPUT_START_Y + curr_local_y;
+ int curr_x = oc * STRIDE_X + INPUT_START_X + curr_local_x;
+#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0
+ int saved_y = curr_y;
+#endif
+ in_addr = input_batch_offset + INPUT_START_Z * input_height * input_width
+ + (curr_y - INPUT_PAD_H) * input_width // y tile offset
+ + curr_x - INPUT_PAD_W; // x tile offset
+ union {
+ Dtype4 in_vec[INVEC_SIZE];
+ Dtype in_array[INVEC_SIZE * 4];
+ } in_buf;
+
+ for(int kd = 0; kd < INPUT_DEPTH; kd++)
+ {
+ int in_offset = in_addr;
+ int reg = 0;
+ LOOP(INVEC_SIZE, reg,
+ {
+ if (curr_local_y + reg * TILE_Y_STRIDE < TILE_Y || INVEC_SIZE * TILE_Y_STRIDE <= (TILE_Y + 2) || reg < INVEC_SIZE - 1) {
+#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0
+ if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + 3 >= INPUT_PAD_W && curr_x < input_width + INPUT_PAD_W) {
+ if (curr_x < INPUT_PAD_W) {
+ in_buf.in_vec[reg].s0 = 0;
+ if (curr_x + 1 >= INPUT_PAD_W)
+ in_buf.in_vec[reg].s1 = *(inputs + in_offset + 1);
+ else
+ in_buf.in_vec[reg].s1 = 0;
+ if (curr_x + 2 >= INPUT_PAD_W)
+ in_buf.in_vec[reg].s2 = *(inputs + in_offset + 2);
+ else
+ in_buf.in_vec[reg].s2 = 0;
+ in_buf.in_vec[reg].s3 = *(inputs + in_offset + 3);
+ } else {
+ VLOAD4(in_buf.in_vec[reg], inputs + in_offset);
+ if (curr_x + 1 >= input_width + INPUT_PAD_W)
+ in_buf.in_vec[reg].s1 = 0;
+ if (curr_x + 2 >= input_width + INPUT_PAD_W)
+ in_buf.in_vec[reg].s2 = 0;
+ if (curr_x + 3 >= input_width + INPUT_PAD_W)
+ in_buf.in_vec[reg].s3 = 0;
+ }
+ } else {
+ in_buf.in_vec[reg] = 0;
+ }
+ curr_y += TILE_Y_STRIDE;
+#else
+ VLOAD4(in_buf.in_vec[reg], inputs + in_offset);
+#endif
+ }
+ in_offset += input_width * TILE_Y_STRIDE;
+ });
+ in_addr += input_height * input_width;
+#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0
+ curr_y = saved_y;
+#endif
+
+#if KERNEL_WIDTH * KERNEL_HEIGHT != 1
+#define WEIGHT_PREF 8
+#else
+#define WEIGHT_PREF 1
+#endif
+ union {
+ Dtype w[WEIGHT_PREF];
+#if KERNEL_WIDTH * KERNEL_HEIGHT != 1
+ INT_TYPE8 ui8;
+#endif
+ } weight_buf;
+ int w_idx=0;
+
+ unsigned int orig_weight_addr = weight_addr;
+#if KERNEL_WIDTH * KERNEL_HEIGHT != 1
+ weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]);
+ weight_addr += SIMD_SIZE * WEIGHT_PREF;
+#else
+ weight_buf.w[0] = as_Dtype(SUB_GROUP_BLOCK_READ((__global INT_TYPE *)&weights[weight_addr]));
+ weight_addr += SIMD_SIZE * 1;
+#endif
+
+#define BLOCK_IN(n) sub_group_broadcast( in_buf.in_array[((n)%4) + ((n) / (TILE_Y_STRIDE * TILE_X)) * 4], (((n) % (TILE_Y_STRIDE * TILE_X))/4))
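+// BLOCK_IN(n): element n of the cooperatively loaded input tile, fetched from
+// whichever lane of the sub-group holds it via sub_group_broadcast.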
+
+ int kr = 0; // kr = Kernel Row
+ LOOP(KERNEL_HEIGHT, kr,// LOOP is a macro that unrolls the loop.
+ {
+ int kc = 0; // kc = Kernel Column
+ LOOP(KERNEL_WIDTH, kc,
+ {
+ for(int br=0; br < OUT_BLOCK_HEIGHT; br++) {
+ for(int bc=0; bc < OUT_BLOCK_WIDTH; bc++) {
+ Dtype input = BLOCK_IN((br * STRIDE_Y + kr * DILATION_Y) * TILE_X + bc * STRIDE_X + kc * DILATION_X);
+ out[br * OUT_BLOCK_WIDTH + bc] = mad(weight_buf.w[w_idx % WEIGHT_PREF], input, out[br * OUT_BLOCK_WIDTH + bc]);
+ }
+ }
+#if KERNEL_WIDTH * KERNEL_HEIGHT > WEIGHT_PREF
+ // We assume KERNEL_W is equal to KERNEL_H here.
+ if ((w_idx + 1) % WEIGHT_PREF == 0
+ #if KERNEL_WIDTH * KERNEL_HEIGHT % 8 != 0
+ && ((w_idx + 1) <= (KERNEL_WIDTH * KERNEL_HEIGHT - WEIGHT_PREF))
+ #endif
+ ) {
+ weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]);
+ weight_addr += SIMD_SIZE * WEIGHT_PREF; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details.
+ }
+ #if KERNEL_WIDTH*KERNEL_HEIGHT % 8 == 0
+ // need to do nothing
+ #else
+ else if ((w_idx + 1) % WEIGHT_PREF == 0 && ((w_idx + 1) > (KERNEL_WIDTH * KERNEL_HEIGHT - WEIGHT_PREF)))
+ #if KERNEL_WIDTH * KERNEL_HEIGHT % 8 == 1
+ weight_buf.w[0] = weights[weight_addr];
+ #elif KERNEL_WIDTH * KERNEL_HEIGHT % 8 == 2
+ weight_buf.ui8.s01 = SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)&weights[weight_addr]);
+ #elif KERNEL_WIDTH * KERNEL_HEIGHT % 8 <= 4
+ weight_buf.ui8.s0123 = SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)&weights[weight_addr]);
+ #else
+ weight_buf.ui8 = SUB_GROUP_BLOCK_READ8((__global INT_TYPE *)&weights[weight_addr]);
+ #endif
+ #endif
+#endif
+ ++w_idx;
+ });
+ });
+ weight_addr = orig_weight_addr + KERNEL_WIDTH * KERNEL_HEIGHT * SIMD_SIZE;
+
+ }
+ // dead code to work around possible compiler bug.
+ if (ALIGNED_NUM_FILTERS != NUM_FILTERS && fm > 0xfffffffeul) {
+ outputs[0] = BLOCK_IN(fm % SIMD_SIZE);
+ }
+ fm = fm % ALIGNED_NUM_FILTERS;
+
+ if ((ALIGNED_NUM_FILTERS == NUM_FILTERS || fm < NUM_FILTERS)) {
+ unsigned int out_addr = OUT_BUFF_OFFSET + ( num_in_batch * TOTAL_OUTPUT_DEPTH + fm ) * output_width * output_height;
+ out_addr += or * output_width + oc;
+ // we need this address calculation for biases because we support views and batching
+#if APPLY_BIAS
+ Dtype bias = biases_base[fm];
+#else
+ Dtype bias = 0;
+#endif
+ for(unsigned int r = 0; r < OUT_BLOCK_HEIGHT; r++) {
+ if (r + or >= output_height) break;
+ for(unsigned int c = 0; c < OUT_BLOCK_WIDTH; c++) {
+ if (c + oc >= output_width) break;
+ // this does a scattered write to SIMD_SIZE different feature maps, so that data within one map is contiguous, thus ready for input to next layer.
+ ACTIVATION_FUNCTION(outputs, out_addr + r * output_width + c, bias + out[r * OUT_BLOCK_WIDTH + c]);
+
+ }
+ }
+ }
+}
+
+#else // KERNEL_GEMM_LIKE
+
+#if APPLY_BIAS
+// Dtype bias[4];
+#define SUBGROUP_GET_BIAS(k, i) intel_sub_group_shuffle(bias[k], i)
+#else
+#define SUBGROUP_GET_BIAS(k, i) ((Dtype)0)
+#endif
+
+#ifdef Conv_Interleaved
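+// OpenCL C only provides 2/3/4/8/16-wide vector types, so the other tile
+// widths used by the interleaved GEMM code are emulated with plain structs.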
+typedef struct float1 { float s0; } float1;
+typedef struct float5 { float s0; float s1; float s2; float s3; float s4; } float5;
+typedef struct float6 { float s0; float s1; float s2; float s3; float s4; float s5; } float6;
+typedef struct float7 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; } float7;
+typedef struct float9 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; float s7; float s8; } float9;
+typedef struct float10 { float s0; float s1; float s2; float s3; float s4; float s5;
+ float s6; float s7; float s8; float s9;} float10;
+typedef struct float11 { float s0; float s1; float s2; float s3; float s4; float s5;
+ float s6; float s7; float s8; float s9; float sa;} float11;
+typedef struct float12 { float s0; float s1; float s2; float s3; float s4; float s5;
+ float s6; float s7; float s8; float s9; float sa; float sb; } float12;
+typedef struct float13 { float s0; float s1; float s2; float s3; float s4; float s5;
+ float s6; float s7; float s8; float s9; float sa; float sb; float sc;} float13;
+typedef struct float14 { float s0; float s1; float s2; float s3; float s4; float s5;
+ float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; } float14;
+typedef struct float15 { float s0; float s1; float s2; float s3; float s4; float s5;
+ float s6; float s7; float s8; float s9; float sa; float sb; float sc; float sd; float se; } float15;
+typedef struct float0 { float s0; } float0; //never used but makes compiler happy.
+
+#define OUT_PITCH_X output_width
+#define ROW_PITCH input_width
+
+#define GEMM_LIKE_KERNEL_ARGS \
+ const __global Dtype *src0, \
+ const __global Dtype *src1, \
+ BIAS_KERNEL_ARG \
+ __global Dtype *dst, \
+ const ushort input_width, \
+ const ushort input_height, \
+ const ushort output_width, \
+ const ushort output_height, \
+ const int out_pitch_y, \
+ const int out_pitch_z, \
+ const int aligned_input_size, \
+ const int slice_pitch
+#endif
+
+#ifdef GEMM_LIKE_CONV_32_1
+//////////////////////////////////////////////////////////////////////////////
+// Conv_Interleaved_32_1_flex
+//
+// Convolution: each workitem computes 1 patch x 32 filters worth of output
+// data. Kernel's inner loop works on a single tile consisting of one
+// row from each patch and the filter data corresponding to that row. Filter
+// matrix is interleaved to reduce GRF bank conflicts. Patches are walked
+// by rows and then by slices. Relies on sub_group extension for block
+// reads and SIMD broadcast. Allows flexible sizing of TILE width (TILE_N)
+// by dynamically selecting one of two code paths: one uses TILE_N = 32 and
+// the other uses TILE_N = 8, 16, or 24.
+#define TILE_M 1
+#define TILE_K KERNEL_WIDTH
+#define TILE_N 32
+
+#ifndef __BEIGNET__
+__attribute__((intel_reqd_sub_group_size(8)))
+#endif
+__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
+{
+ const int group_x = get_group_id(0);
+ const int group_y = get_group_id(1);
+ const int global_x = get_global_id(0);
+ const int global_y = get_global_id(1);
+ const int global_z = get_global_id(2);
+ int interleaved_y;
+ int kernel_y;
+ int kernel_idx;
+
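+// Eight MADs per lane: this lane's input value (_rowA) is multiplied by the
+// filter values held by lanes 0..7 of the sub-group (sub_group_broadcast).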
+#define DOT_PRODUCT_8( _result, _rowA, colB ) \
+ { \
+ _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \
+ _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \
+ _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \
+ _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \
+ _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \
+ _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \
+ _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \
+ _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \
+ }
+ typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;
+
+    // True for all threads if filter_width is a multiple of TILE_N;
+    // otherwise true for all but the right-most column of threads.
+ if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )
+ {
+ // Result ctile (*dst) is M rows x N columns
+ // LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile.
+ Dtype8 blockC00 = 0.f;
+ Dtype8 blockC10 = 0.f;
+ Dtype8 blockC20 = 0.f;
+ Dtype8 blockC30 = 0.f;
+
+ // Src0 (patch input) is directly used as atile.
+ // Each work item points to the start of a different patch.
+ // atile is M rows x K columns.
+ int curr_x = ( global_y % output_width ) * STRIDE_X;
+ int curr_y = ( global_y / output_width ) * STRIDE_Y;
+#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
+ int saved_y = curr_y;
+#endif
+ const __global Dtype *src0_read = src0
+ + aligned_input_size * global_z // batch offset
+ + (curr_y - INPUT_PAD_H) * ROW_PITCH // y offset
+ + (curr_x - INPUT_PAD_W); // x offset
+
+ // Src1 (filter) is directly used as btile.
+ // It starts at the top of src1 and walks down.
+ // btile is K rows x N columns.
+ const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);
+
+ // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
+ // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
+ // and KERNEL_WIDTH/2 rows of interleaved filter.
+ int patch_depth = 0;
+ do
+ {
+ int patch_row = 0;
+#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
+ curr_y = saved_y;
+#endif
+
+ do
+ {
+ // Load atile and btile.
+ // Kernel data is partially interleaved. Every 2 rows are interleaved at Dtype8 granularity.
+ // The exception is that if KERNEL_WIDTH is odd the last row is not interleaved. The non
+ // interleaved row is padded with zero to ensure same size as interleaved rows. This
+                // interleaving is done to ensure 0% GRF bank conflicts. For example, this is how the
+ // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
+ // (0, 0) (8, 0) (16, 0) (24, 0) ... (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
+ // (0, 1) (8, 1) (16, 1) (24, 1) ... => (0, 2) (8, 2) (16, 2) (24, 2) ...
+ // (0, 2) (8, 2) (16, 2) (24, 2) ... ...
+ // ...
+ const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
+
+#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1
+ Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];
+ Dtype* pblockA00 = (Dtype*)(&blockA00);
+#else
+ Dtype_t blockA00;
+ Dtype* pblockA00 = (Dtype*)(&blockA00);
+ int pos = 0;
+ LOOP(KERNEL_WIDTH, pos,
+ {
+ if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
+ pblockA00[pos] = src0_read[pos * DILATION_X];
+ else
+ pblockA00[pos] = 0;
+ })
+ curr_y += DILATION_Y;
+#endif
+ src0_read += (ROW_PITCH * DILATION_Y);
+
+ Dtype blockB00[KERNEL_WIDTH*4];
+ Dtype8* p8BlockB00 = (Dtype8*)blockB00;
+ Dtype4* p4BlockB00 = (Dtype4*)blockB00;
+ Dtype* pBlockB00 = (Dtype* )blockB00;
+
+ interleaved_y = 0;
+ LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
+ {
+ p8BlockB00[interleaved_y] = as_Dtype8( SUB_GROUP_BLOCK_READ8( (const __global INT_TYPE *)src1_read ) );
+ src1_read += WIDTH1 * 2;
+ } )
+ if ( kernel_width_is_odd )
+ {
+ p4BlockB00[KERNEL_WIDTH - 1] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE *)src1_read ) );
+ src1_read += WIDTH1 * 2;
+ }
+
+ // Perform MADs
+ kernel_idx = 0;
+ interleaved_y = 0;
+ LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
+ {
+ kernel_y = interleaved_y * 2;
+ DOT_PRODUCT_8( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_8( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_8( blockC20, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_8( blockC30, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
+ } )
+ kernel_y = interleaved_y * 2;
+ if ( kernel_width_is_odd )
+ {
+ DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
+ }
+ }
+
+ //while( ++patch_row < 1 ); //debug
+ while( ++patch_row < KERNEL_HEIGHT );
+
+ src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y); // reset to start of next slice of patch
+ }
+ //while ( ++patch_depth < 1 ); //debug
+ while ( ++patch_depth < INPUT_DEPTH );
+
+ // Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
+ // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
+ int out_offset = global_z * out_pitch_z // batch offset
+ + ( group_x * TILE_N ) * out_pitch_y // channel offset
+ + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset
+ + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset
+
+ __global Dtype *out = dst + out_offset;
+#if APPLY_BIAS
+ Dtype bias[4];
+ Dtype4 *bias_vec;
+ bias_vec = (Dtype4*)bias;
+ *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
+#endif
+#ifdef FUSED_CONV_CHANNEL_RELU
+ Dtype slope[4];
+ Dtype4 *slope_vec;
+ slope_vec = (Dtype4*)slope;
+ *slope_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)negative_slope_base + group_x * TILE_N));
+ Dtype negative_slope;
+#endif
+ if (global_y * TILE_M < output_width * output_height )
+ {
+ for (int i = 0; i < 8; i++)
+ {
+#ifdef FUSED_CONV_CHANNEL_RELU
+ negative_slope = intel_sub_group_shuffle(slope[0], i);
+#endif
+ ACTIVATION_FUNCTION(dst, out_offset + ( 0 + i ) * out_pitch_y, blockC00[i] + SUBGROUP_GET_BIAS(0, i));
+
+#ifdef FUSED_CONV_CHANNEL_RELU
+ negative_slope = intel_sub_group_shuffle(slope[1], i);
+#endif
+ ACTIVATION_FUNCTION(dst, out_offset + ( 8 + i ) * out_pitch_y, blockC10[i] + SUBGROUP_GET_BIAS(1, i));
+#ifdef FUSED_CONV_CHANNEL_RELU
+ negative_slope = intel_sub_group_shuffle(slope[2], i);
+#endif
+ ACTIVATION_FUNCTION(dst, out_offset + ( 16 + i ) * out_pitch_y, blockC20[i] + SUBGROUP_GET_BIAS(2, i));
+#ifdef FUSED_CONV_CHANNEL_RELU
+ negative_slope = intel_sub_group_shuffle(slope[3], i);
+#endif
+ ACTIVATION_FUNCTION(dst, out_offset + ( 24 + i ) * out_pitch_y, blockC30[i] + SUBGROUP_GET_BIAS(3, i));
+ }
+ }
+ }
+#if TILE_N_LAST > 0
+ else
+ {
+
+ // Result ctile (*dst) is M rows x N columns
+ // LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile.
+ int i = 0;
+ Dtype8 blockC[TILE_N_LAST_DIV8];
+ LOOP(TILE_N_LAST_DIV8, i,
+ {
+ blockC[i] = 0.f;
+ } )
+
+ // Src0 (patch input) is directly used as atile.
+ // Each work item points to the start of a different patch.
+ // atile is M rows x K columns.
+ int curr_x = ( global_y % output_width ) * STRIDE_X;
+ int curr_y = ( global_y / output_width ) * STRIDE_Y;
+#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
+ int saved_y = curr_y;
+#endif
+ const __global Dtype *src0_read = src0
+ + aligned_input_size * global_z // batch offset
+ + (curr_y - INPUT_PAD_H) * ROW_PITCH // y offset
+ + (curr_x - INPUT_PAD_W); // x offset
+
+ // Src1 (filter) is directly used as btile.
+ // It starts at the top of src1 and walks down.
+ // btile is K rows x N columns.
+ const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);
+
+ // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
+ // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
+ // and KERNEL_WIDTH/2 rows of interleaved filter.
+ int patch_depth = 0;
+ do
+ {
+ int patch_row = 0;
+#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
+ curr_y = saved_y;
+#endif
+ do
+ {
+ // Load atile and interleaved btile.
+ const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
+#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1
+ Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];
+ Dtype* pblockA00 = (Dtype*)(&blockA00);
+#else
+ Dtype_t blockA00;
+ Dtype* pblockA00 = (Dtype*)(&blockA00);
+ int pos = 0;
+ LOOP(KERNEL_WIDTH, pos,
+ {
+ if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
+ pblockA00[pos] = src0_read[pos * DILATION_X];
+ else
+ pblockA00[pos] = 0;
+ })
+ curr_y += DILATION_Y;
+#endif
+ src0_read += (ROW_PITCH * DILATION_Y);
+ Dtype blockB[KERNEL_WIDTH * TILE_N_LAST_DIV8];
+
+ interleaved_y = 0;
+ LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
+ {
+#if TILE_N_LAST_DIV8 == 1
+ Dtype2* p2BlockB = (Dtype2* )blockB;
+ p2BlockB[interleaved_y] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
+#elif TILE_N_LAST_DIV8 == 2
+ Dtype4* p4BlockB = (Dtype4* )blockB;
+ p4BlockB[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
+#elif TILE_N_LAST_DIV8 == 3
+ //TODO: broken. No block_read6
+ Dtype6* p6BlockB = (Dtype6* )blockB;
+ (*((Dtype8*)(&p6BlockB[interleaved_y]))).s0123 = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
+ (*((Dtype8*)(&p6BlockB[interleaved_y]))).s45 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)(src1_read + 4 * 8) ) );
+#endif
+ src1_read += WIDTH1 * 2;
+ } )
+ if ( kernel_width_is_odd )
+ {
+#if TILE_N_LAST_DIV8 == 1
+ Dtype* pBlockB = (Dtype* )blockB;
+ pBlockB[KERNEL_WIDTH - 1] = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*)src1_read ) );
+#elif TILE_N_LAST_DIV8 == 2
+ Dtype2* p2BlockB = (Dtype2* )blockB;
+ p2BlockB[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
+#elif TILE_N_LAST_DIV8 == 3
+ Dtype3* p3BlockB = (Dtype3* )blockB;
+ p3BlockB[KERNEL_WIDTH - 1].s01 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
+ p3BlockB[KERNEL_WIDTH - 1].s2 = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*) (src1_read + 2 * 8) ) );
+#endif
+ src1_read += WIDTH1 * 2;
+ }
+
+ // Perform MADs
+ Dtype* pBlockB = (Dtype*)blockB;
+ kernel_idx = 0;
+ interleaved_y = 0;
+ LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
+ {
+ kernel_y = interleaved_y * 2;
+ DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
+#if TILE_N_LAST_DIV8 >= 2
+ DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
+#if TILE_N_LAST_DIV8 >= 3
+ DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
+#endif
+#endif
+ } )
+ kernel_y = interleaved_y * 2;
+ if ( kernel_width_is_odd )
+ {
+ DOT_PRODUCT_8( blockC[0], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
+#if TILE_N_LAST_DIV8 >= 2
+ DOT_PRODUCT_8( blockC[1], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
+#if TILE_N_LAST_DIV8 >= 3
+ DOT_PRODUCT_8( blockC[2], pblockA00[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
+#endif
+#endif
+ }
+ }
+
+ //while( ++patch_row < 1 ); //debug
+ while( ++patch_row < KERNEL_HEIGHT );
+
+ src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch
+ }
+ //while ( ++patch_depth < 1 ); //debug
+ while ( ++patch_depth < INPUT_DEPTH );
+
+ // Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
+ // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
+ int out_offset = global_z * out_pitch_z // batch offset
+ + ( group_x * TILE_N ) * out_pitch_y // channel offset
+ + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset
+ + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset
+ __global Dtype *out = dst + out_offset;
+#if APPLY_BIAS
+ Dtype bias[4];
+ Dtype4 *bias_vec;
+ bias_vec = (Dtype4*)bias;
+ *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
+#endif
+
+#ifdef FUSED_CONV_CHANNEL_RELU
+ Dtype slope[4];
+ Dtype4 *slope_vec;
+ slope_vec = (Dtype4*)slope;
+ *slope_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)negative_slope_base + group_x * TILE_N));
+ Dtype negative_slope;
+#endif
+
+ if (global_y * TILE_M < output_width * output_height )
+ {
+ for (int i = 0; i < 8; i++)
+ {
+ if ( TILE_N_LAST_DIV8 > 0 )
+ {
+#ifdef FUSED_CONV_CHANNEL_RELU
+ negative_slope = intel_sub_group_shuffle(slope[0], i);
+#endif
+ ACTIVATION_FUNCTION(dst, out_offset + ( 0+i) * out_pitch_y, blockC[0][i] + SUBGROUP_GET_BIAS(0, i));
+ }
+ if ( TILE_N_LAST_DIV8 > 1 )
+ {
+#ifdef FUSED_CONV_CHANNEL_RELU
+ negative_slope = intel_sub_group_shuffle(slope[1], i);
+#endif
+ ACTIVATION_FUNCTION(dst, out_offset + ( 8+i) * out_pitch_y, blockC[1][i] + SUBGROUP_GET_BIAS(1, i));
+ }
+ if ( TILE_N_LAST_DIV8 > 2 )
+ {
+#ifdef FUSED_CONV_CHANNEL_RELU
+ negative_slope = intel_sub_group_shuffle(slope[2], i);
+#endif
+ ACTIVATION_FUNCTION(dst, out_offset + (16+i) * out_pitch_y, blockC[2][i] + SUBGROUP_GET_BIAS(2, i));
+ }
+ if ( TILE_N_LAST_DIV8 > 3 )
+ {
+
+#ifdef FUSED_CONV_CHANNEL_RELU
+ negative_slope = intel_sub_group_shuffle(slope[3], i);
+#endif
+ ACTIVATION_FUNCTION(dst, out_offset + (24+i) * out_pitch_y, blockC[3][i] + SUBGROUP_GET_BIAS(3, i));
+ }
+ }
+ }
+ }
+#endif
+}
+#endif
+#ifdef GEMM_LIKE_CONV_32_2
+
+//////////////////////////////////////////////////////////////////////////////
+// Conv_Interleaved_32_2_flex
+//
+// Convolution: each workitem computes 2 patches (TILE_M = 2) x 32 filters worth
+// of output data.
+// row from each patch and the filter data corresponding to that row. Filter
+// matrix is interleaved to reduce GRF bank conflicts. Patches are walked
+// by rows and then by slices. Relies on sub_group extension for block
+// reads and SIMD broadcast. Allows flexible sizing of TILE width (TILE_N)
+// by dynamically selecting one of two code paths: one uses TILE_N = 32 and
+// the other uses TILE_N = 8, 16, or 24.
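+//
+// Note (illustration, not dispatch code): TILE_M = 2 means each work-item covers
+// two consecutive output locations, global_y*TILE_M and global_y*TILE_M + 1, which
+// is why the body below keeps separate patch cursors (curr_x0/curr_y0 and
+// curr_x1/curr_y1) and separate accumulators (blockC00..C30 vs. blockC01..C31).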
+#define TILE_M 2
+#define TILE_K KERNEL_WIDTH
+#define TILE_N 32
+
+#ifndef __BEIGNET__
+__attribute__((intel_reqd_sub_group_size(8)))
+#endif
+__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
+{
+ const int group_x = get_group_id(0);
+ const int group_y = get_group_id(1);
+ const int global_x = get_global_id(0);
+ const int global_y = get_global_id(1);
+ const int global_z = get_global_id(2);
+ int interleaved_y;
+ int kernel_y;
+ int kernel_idx;
+
+#define DOT_PRODUCT_8( _result, _rowA, colB ) \
+ { \
+ _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \
+ _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \
+ _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \
+ _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \
+ _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \
+ _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \
+ _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \
+ _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \
+ }
+ typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;
+
+ // True for all threads if the filter matrix width (WIDTH1) is a multiple of
+ // TILE_N; otherwise true for all but the right-most column of threads.
+ if( TILE_N_LAST == 0 || global_x < WIDTH1 / TILE_N )
+ {
+ // Result ctile (*dst) is M rows x N columns
+ // LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile.
+ Dtype8 blockC00 = 0.f;
+ Dtype8 blockC10 = 0.f;
+ Dtype8 blockC20 = 0.f;
+ Dtype8 blockC30 = 0.f;
+ Dtype8 blockC01 = 0.f;
+ Dtype8 blockC11 = 0.f;
+ Dtype8 blockC21 = 0.f;
+ Dtype8 blockC31 = 0.f;
+
+ // Src0 (patch input) is directly used as atile.
+ // Each work item points to the start of a different patch.
+ // atile is M rows x K columns.
+ int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;
+ int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;
+ int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;
+ int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;
+#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
+ int saved_y0 = curr_y0;
+ int saved_y1 = curr_y1;
+#endif
+ const __global Dtype *src0_read0 = src0
+ + aligned_input_size * global_z // batch offset
+ + (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset
+ + curr_x0 - INPUT_PAD_W; // x offset
+ const __global Dtype *src0_read1 = src0
+ + aligned_input_size * global_z // batch offset
+ + (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset
+ + curr_x1 - INPUT_PAD_W; // x offset
+
+ // Src1 (filter) is directly used as btile.
+ // It starts at the top of src1 and walks down.
+ // btile is K rows x N columns.
+ const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);
+
+ // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
+ // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
+ // and KERNEL_WIDTH/2 rows of interleaved filter.
+ int patch_depth = 0;
+ do
+ {
+ int patch_row = 0;
+ do
+ {
+ // Load atile and btile.
+ // Kernel data is partially interleaved. Every 2 rows are interleaved at Dtype8 granularity.
+ // The exception is that if KERNEL_WIDTH is odd, the last row is not interleaved. The
+ // non-interleaved row is padded with zeros to match the size of the interleaved rows.
+ // This interleaving is done to avoid GRF bank conflicts. For example, this is how the
+ // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
+ // (0, 0) (8, 0) (16, 0) (24, 0) ... (0, 0) (0, 1) (8, 0) (0, 1) (16, 0) (0, 1) (24, 0) ..
+ // (0, 1) (8, 1) (16, 1) (24, 1) ... => (0, 2) (8, 2) (16, 2) (24, 2) ...
+ // (0, 2) (8, 2) (16, 2) (24, 2) ... ...
+ // ...
+ const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
+#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1
+ Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH;
+ Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH;
+ Dtype* pblockA00 = (Dtype*)(&blockA00);
+ Dtype* pblockA01 = (Dtype*)(&blockA01);
+#else
+ Dtype_t blockA00;
+ Dtype* pblockA00 = (Dtype*)(&blockA00);
+ int pos = 0;
+ LOOP(KERNEL_WIDTH, pos,
+ {
+ if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
+ pblockA00[pos] = src0_read0[pos * DILATION_X];
+ else
+ pblockA00[pos] = 0;
+ })
+ curr_y0 += DILATION_Y;
+ Dtype_t blockA01;
+ Dtype* pblockA01 = (Dtype*)(&blockA01);
+ pos = 0;
+ LOOP(KERNEL_WIDTH, pos,
+ {
+ if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
+ pblockA01[pos] = src0_read1[pos * DILATION_X];
+ else
+ pblockA01[pos] = 0;
+ })
+ curr_y1 += DILATION_Y;
+ src0_read0 += (ROW_PITCH * DILATION_Y);
+ src0_read1 += (ROW_PITCH * DILATION_Y);
+#endif
+ Dtype blockB00[KERNEL_WIDTH*4];
+ Dtype8* p8BlockB00 = (Dtype8*)blockB00;
+ Dtype4* p4BlockB00 = (Dtype4*)blockB00;
+ Dtype* pBlockB00 = (Dtype* )blockB00;
+
+ interleaved_y = 0;
+ LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
+ {
+ p8BlockB00[interleaved_y] = as_Dtype8( SUB_GROUP_BLOCK_READ8( (const __global INT_TYPE*)src1_read ) );
+ src1_read += WIDTH1 * 2;
+ } )
+ if ( kernel_width_is_odd )
+ {
+ p4BlockB00[KERNEL_WIDTH - 1] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
+ src1_read += WIDTH1 * 2;
+ }
+ // Perform MADs
+ kernel_idx = 0;
+ interleaved_y = 0;
+ LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
+ {
+ kernel_y = interleaved_y * 2;
+ DOT_PRODUCT_8( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] );
+ DOT_PRODUCT_8( blockC01, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_8( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
+ DOT_PRODUCT_8( blockC01, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_8( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] );
+ DOT_PRODUCT_8( blockC11, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_8( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
+ DOT_PRODUCT_8( blockC11, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_8( blockC20, pblockA00[kernel_y ], pBlockB00[kernel_idx] );
+ DOT_PRODUCT_8( blockC21, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_8( blockC20, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
+ DOT_PRODUCT_8( blockC21, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_8( blockC30, pblockA00[kernel_y ], pBlockB00[kernel_idx] );
+ DOT_PRODUCT_8( blockC31, pblockA01[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_8( blockC30, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] );
+ DOT_PRODUCT_8( blockC31, pblockA01[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
+ } )
+ if ( kernel_width_is_odd )
+ {
+ kernel_y = interleaved_y * 2;
+ DOT_PRODUCT_8( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] );
+ DOT_PRODUCT_8( blockC01, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_8( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] );
+ DOT_PRODUCT_8( blockC11, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_8( blockC20, pblockA00[kernel_y], pBlockB00[kernel_idx] );
+ DOT_PRODUCT_8( blockC21, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_8( blockC30, pblockA00[kernel_y], pBlockB00[kernel_idx] );
+ DOT_PRODUCT_8( blockC31, pblockA01[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
+ }
+ }
+
+ //while( ++patch_row < 1 ); //debug
+ while( ++patch_row < KERNEL_HEIGHT );
+#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1
+ curr_y0 = saved_y0;
+ curr_y1 = saved_y1;
+#endif
+ src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch
+ src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
+ }
+ //while ( ++patch_depth < 1 ); //debug
+ while ( ++patch_depth < INPUT_DEPTH );
+
+ // Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
+ // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
+ int out0_offset = global_z * out_pitch_z // batch offset
+ + ( group_x * TILE_N ) * out_pitch_y // channel offset
+ + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
+ + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset
+ int out1_offset = global_z * out_pitch_z // batch offset
+ + ( group_x * TILE_N ) * out_pitch_y // channel offset
+ + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
+ + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset
+
+#if APPLY_BIAS
+ Dtype bias[4];
+ Dtype4 *bias_vec;
+ bias_vec = (Dtype4*)bias;
+ *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
+#endif
+
+#ifdef FUSED_CONV_CHANNEL_RELU
+ Dtype slope[4];
+ Dtype4 *slope_vec;
+ slope_vec = (Dtype4*)slope;
+ *slope_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)negative_slope_base + group_x * TILE_N));
+ Dtype negative_slope;
+#endif
+
+ if( global_y * TILE_M < output_width * output_height )
+ {
+ for( int i = 0; i < 8; i++ )
+ {
+#ifdef FUSED_CONV_CHANNEL_RELU
+ negative_slope = intel_sub_group_shuffle(slope[0], i);
+#endif
+ ACTIVATION_FUNCTION(dst, out0_offset + ( 0+i) * out_pitch_y, blockC00[i] + SUBGROUP_GET_BIAS(0, i));
+#ifdef FUSED_CONV_CHANNEL_RELU
+ negative_slope = intel_sub_group_shuffle(slope[1], i);
+#endif
+ ACTIVATION_FUNCTION(dst, out0_offset + ( 8+i) * out_pitch_y, blockC10[i] + SUBGROUP_GET_BIAS(1, i));
+#ifdef FUSED_CONV_CHANNEL_RELU
+ negative_slope = intel_sub_group_shuffle(slope[2], i);
+#endif
+ ACTIVATION_FUNCTION(dst, out0_offset + (16+i) * out_pitch_y, blockC20[i] + SUBGROUP_GET_BIAS(2, i));
+#ifdef FUSED_CONV_CHANNEL_RELU
+ negative_slope = intel_sub_group_shuffle(slope[3], i);
+#endif
+ ACTIVATION_FUNCTION(dst, out0_offset + (24+i) * out_pitch_y, blockC30[i] + SUBGROUP_GET_BIAS(3, i));
+ }
+ }
+ if( global_y * TILE_M + 1 < output_width * output_height )
+ {
+ for( int i = 0; i < 8; i++ )
+ {
+
+#ifdef FUSED_CONV_CHANNEL_RELU
+ negative_slope = intel_sub_group_shuffle(slope[0], i);
+#endif
+ ACTIVATION_FUNCTION(dst, out1_offset + ( 0+i) * out_pitch_y, blockC01[i] + SUBGROUP_GET_BIAS(0, i));
+
+#ifdef FUSED_CONV_CHANNEL_RELU
+ negative_slope = intel_sub_group_shuffle(slope[1], i);
+#endif
+ ACTIVATION_FUNCTION(dst, out1_offset + ( 8+i) * out_pitch_y, blockC11[i] + SUBGROUP_GET_BIAS(1, i));
+
+#ifdef FUSED_CONV_CHANNEL_RELU
+ negative_slope = intel_sub_group_shuffle(slope[2], i);
+#endif
+ ACTIVATION_FUNCTION(dst, out1_offset + (16+i) * out_pitch_y, blockC21[i] + SUBGROUP_GET_BIAS(2, i));
+
+#ifdef FUSED_CONV_CHANNEL_RELU
+ negative_slope = intel_sub_group_shuffle(slope[3], i);
+#endif
+ ACTIVATION_FUNCTION(dst, out1_offset + (24+i) * out_pitch_y, blockC31[i] + SUBGROUP_GET_BIAS(3, i));
+ }
+ }
+ }
+#if TILE_N_LAST > 0
+ else
+ {
+
+ // Result ctile (*dst) is M rows x N columns
+ // LWG size is 1x8. Thus each thread calculates 8*M rows x N cols of ctile.
+ int i = 0;
+ Dtype8 blockC0[TILE_N_LAST_DIV8];
+ Dtype8 blockC1[TILE_N_LAST_DIV8];
+ LOOP(TILE_N_LAST_DIV8, i,
+ {
+ blockC0[i] = 0.f;
+ blockC1[i] = 0.f;
+ } )
+
+ // Src0 (patch input) is directly used as atile.
+ // Each work item points to the start of a different patch.
+ // atile is M rows x K columns.
+ int curr_x0 = ( ( global_y * TILE_M + 0 ) % output_width ) * STRIDE_X;
+ int curr_x1 = ( ( global_y * TILE_M + 1 ) % output_width ) * STRIDE_X;
+ int curr_y0 = ( ( global_y * TILE_M + 0 ) / output_width ) * STRIDE_Y;
+ int curr_y1 = ( ( global_y * TILE_M + 1 ) / output_width ) * STRIDE_Y;
+#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
+ int saved_y0 = curr_y0;
+ int saved_y1 = curr_y1;
+#endif
+ const __global Dtype *src0_read0 = src0
+ + aligned_input_size * global_z // batch offset
+ + (curr_y0 - INPUT_PAD_H) * ROW_PITCH // y offset
+ + curr_x0 - INPUT_PAD_W; // x offset
+ const __global Dtype *src0_read1 = src0
+ + aligned_input_size * global_z // batch offset
+ + (curr_y1 - INPUT_PAD_H) * ROW_PITCH // y offset
+ + curr_x1 - INPUT_PAD_W; // x offset
+
+ // Src1 (filter) is directly used as btile.
+ // It starts at the top of src1 and walks down.
+ // btile is K rows x N columns.
+ const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2);
+
+ // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
+ // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
+ // and KERNEL_WIDTH/2 rows of interleaved filter.
+ int patch_depth = 0;
+ do
+ {
+ int patch_row = 0;
+ do
+ {
+ // Load atile and interleaved btile.
+ const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
+#if INPUT_PAD_H == 0 && INPUT_PAD_W == 0 && DILATION_X == 1 && DILATION_Y == 1
+ Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read0 )[ 0 ]; src0_read0 += ROW_PITCH;
+ Dtype_t blockA01 = ( (const __global Dtype_t*)src0_read1 )[ 0 ]; src0_read1 += ROW_PITCH;
+ Dtype* pblockA00 = (Dtype*)(&blockA00);
+ Dtype* pblockA01 = (Dtype*)(&blockA01);
+#else
+ Dtype_t blockA00;
+ Dtype* pblockA00 = (Dtype*)(&blockA00);
+ int pos = 0;
+ LOOP(KERNEL_WIDTH, pos,
+ {
+ if (curr_y0 >= INPUT_PAD_H && curr_y0 < input_height + INPUT_PAD_H && curr_x0 + pos * DILATION_X >= INPUT_PAD_W && curr_x0 + pos * DILATION_X < input_width + INPUT_PAD_W)
+ pblockA00[pos] = src0_read0[pos * DILATION_X];
+ else
+ pblockA00[pos] = 0;
+ })
+ curr_y0 += DILATION_Y;
+ Dtype_t blockA01;
+ Dtype* pblockA01 = (Dtype*)(&blockA01);
+ pos = 0;
+ LOOP(KERNEL_WIDTH, pos,
+ {
+ if (curr_y1 >= INPUT_PAD_H && curr_y1 < input_height + INPUT_PAD_H && curr_x1 + pos * DILATION_X >= INPUT_PAD_W && curr_x1 + pos * DILATION_X < input_width + INPUT_PAD_W)
+ pblockA01[pos] = src0_read1[pos * DILATION_X];
+ else
+ pblockA01[pos] = 0;
+ })
+ curr_y1 += DILATION_Y;
+ src0_read0 += (ROW_PITCH * DILATION_Y);
+ src0_read1 += (ROW_PITCH * DILATION_Y);
+#endif
+ Dtype blockB[KERNEL_WIDTH * TILE_N_LAST_DIV8];
+
+ interleaved_y = 0;
+ LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
+ {
+#if TILE_N_LAST_DIV8 == 1
+ Dtype2* p2BlockB = (Dtype2* )blockB;
+ p2BlockB[interleaved_y] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
+#elif TILE_N_LAST_DIV8 == 2
+ Dtype4* p4BlockB = (Dtype4* )blockB;
+ p4BlockB[interleaved_y] = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
+#elif TILE_N_LAST_DIV8 == 3
+ //TODO: broken. No block_read6
+ Dtype6* p6BlockB = (Dtype6* )blockB;
+ (*((Dtype8*)(&p6BlockB[interleaved_y]))).s0123 = as_Dtype4( SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read ) );
+ (*((Dtype8*)(&p6BlockB[interleaved_y]))).s45 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)(src1_read + 4 * 8) ) );
+#endif
+ src1_read += WIDTH1 * 2;
+ } )
+ if ( kernel_width_is_odd )
+ {
+#if TILE_N_LAST_DIV8 == 1
+ Dtype* pBlockB = (Dtype* )blockB;
+ pBlockB[KERNEL_WIDTH - 1] = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*)src1_read ) );
+#elif TILE_N_LAST_DIV8 == 2
+ Dtype2* p2BlockB = (Dtype2* )blockB;
+ p2BlockB[KERNEL_WIDTH - 1] = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
+#elif TILE_N_LAST_DIV8 == 3
+ Dtype3* p3BlockB = (Dtype3* )blockB;
+ p3BlockB[KERNEL_WIDTH - 1].s01 = as_Dtype2( SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read ) );
+ p3BlockB[KERNEL_WIDTH - 1].s2 = as_Dtype( SUB_GROUP_BLOCK_READ( (const __global INT_TYPE*) (src1_read + 8) ) );
+#endif
+ src1_read += WIDTH1 * 2;
+ }
+
+ // Perform MADs
+ Dtype* pBlockB = (Dtype*)blockB;
+ kernel_idx = 0;
+ interleaved_y = 0;
+ LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
+ {
+ kernel_y = interleaved_y * 2;
+ DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y ], pBlockB[kernel_idx] );
+ DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );
+ DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
+#if TILE_N_LAST_DIV8 >= 2
+ DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y ], pBlockB[kernel_idx] );
+ DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );
+ DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
+#if TILE_N_LAST_DIV8 >= 3
+ DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y ], pBlockB[kernel_idx] );
+ DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y ], pBlockB[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y + 1], pBlockB[kernel_idx] );
+ DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y + 1], pBlockB[kernel_idx] ); kernel_idx++;
+#endif
+#endif
+ } )
+ kernel_y = interleaved_y * 2;
+ if ( kernel_width_is_odd )
+ {
+ DOT_PRODUCT_8( blockC0[0], pblockA00[kernel_y], pBlockB[kernel_idx] );
+ DOT_PRODUCT_8( blockC1[0], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
+#if TILE_N_LAST_DIV8 >= 2
+ DOT_PRODUCT_8( blockC0[1], pblockA00[kernel_y], pBlockB[kernel_idx] );
+ DOT_PRODUCT_8( blockC1[1], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
+#if TILE_N_LAST_DIV8 >= 3
+ DOT_PRODUCT_8( blockC0[2], pblockA00[kernel_y], pBlockB[kernel_idx] );
+ DOT_PRODUCT_8( blockC1[2], pblockA01[kernel_y], pBlockB[kernel_idx] ); kernel_idx++;
+#endif
+#endif
+ }
+ }
+
+ //while( ++patch_row < 1 ); //debug
+ while( ++patch_row < KERNEL_HEIGHT );
+#if INPUT_PAD_W != 0 || INPUT_PAD_H != 0 || DILATION_X != 1 || DILATION_Y != 1
+ curr_y0 = saved_y0;
+ curr_y1 = saved_y1;
+#endif
+ src0_read0 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch
+ src0_read1 += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y );
+ }
+ //while ( ++patch_depth < 1 ); //debug
+ while ( ++patch_depth < INPUT_DEPTH );
+
+ // Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
+ // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
+ int out0_offset = global_z * out_pitch_z // batch offset
+ + ( group_x * TILE_N ) * out_pitch_y // channel offset
+ + ( ( global_y * TILE_M + 0 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
+ + ( ( global_y * TILE_M + 0 ) % output_width ) + OUT_PADDING_LEFT; // x offset
+ int out1_offset = global_z * out_pitch_z // batch offset
+ + ( group_x * TILE_N ) * out_pitch_y // channel offset
+ + ( ( global_y * TILE_M + 1 ) / output_width + OUT_PADDING_HEIGHT ) * OUT_PITCH_X // y offset
+ + ( ( global_y * TILE_M + 1 ) % output_width ) + OUT_PADDING_LEFT; // x offset
+ __global Dtype *out1 = dst + out1_offset;
+
+#if APPLY_BIAS
+ Dtype bias[4];
+ Dtype4 *bias_vec;
+ bias_vec = (Dtype4*)bias;
+ *bias_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)biases_base + group_x * TILE_N));
+#endif
+#ifdef FUSED_CONV_CHANNEL_RELU
+ Dtype slope[4];
+ Dtype4 *slope_vec;
+ slope_vec = (Dtype4*)slope;
+ *slope_vec = as_Dtype4(SUB_GROUP_BLOCK_READ4((__global INT_TYPE *)negative_slope_base + group_x * TILE_N));
+ Dtype negative_slope;
+#endif
+ if( global_y * TILE_M < output_width * output_height )
+ {
+ for( int i = 0; i < 8; i++ )
+ {
+ if ( TILE_N_LAST_DIV8 > 0 )
+ {
+
+#ifdef FUSED_CONV_CHANNEL_RELU
+ negative_slope = intel_sub_group_shuffle(slope[0], i);
+#endif
+ ACTIVATION_FUNCTION(dst, out0_offset + ( 0+i) * out_pitch_y, blockC0[0][i] + SUBGROUP_GET_BIAS(0, i));
+ }
+ if ( TILE_N_LAST_DIV8 > 1 )
+ {
+#ifdef FUSED_CONV_CHANNEL_RELU
+ negative_slope = intel_sub_group_shuffle(slope[1], i);
+#endif
+ ACTIVATION_FUNCTION(dst, out0_offset + ( 8+i) * out_pitch_y, blockC0[1][i] + SUBGROUP_GET_BIAS(1, i));
+ }
+ if ( TILE_N_LAST_DIV8 > 2 )
+ {
+#ifdef FUSED_CONV_CHANNEL_RELU
+ negative_slope = intel_sub_group_shuffle(slope[2], i);
+#endif
+ ACTIVATION_FUNCTION(dst, out0_offset + (16+i) * out_pitch_y, blockC0[2][i] + SUBGROUP_GET_BIAS(2, i));
+ }
+ if ( TILE_N_LAST_DIV8 > 3 )
+ {
+#ifdef FUSED_CONV_CHANNEL_RELU
+ negative_slope = intel_sub_group_shuffle(slope[3], i);
+#endif
+ ACTIVATION_FUNCTION(dst, out0_offset + (24+i) * out_pitch_y, blockC0[3][i] + SUBGROUP_GET_BIAS(3, i));
+ }
+ }
+ }
+ if( global_y * TILE_M + 1 < output_width * output_height )
+ {
+ for( int i = 0; i < 8; i++ )
+ {
+ if ( TILE_N_LAST_DIV8 > 0 )
+ {
+#ifdef FUSED_CONV_CHANNEL_RELU
+ negative_slope = intel_sub_group_shuffle(slope[0], i);
+#endif
+ ACTIVATION_FUNCTION(dst, out1_offset + ( 0+i) * out_pitch_y, blockC1[0][i] + SUBGROUP_GET_BIAS(0, i));
+ }
+ if ( TILE_N_LAST_DIV8 > 1 )
+ {
+#ifdef FUSED_CONV_CHANNEL_RELU
+ negative_slope = intel_sub_group_shuffle(slope[1], i);
+#endif
+ ACTIVATION_FUNCTION(dst, out1_offset + ( 8+i) * out_pitch_y, blockC1[1][i] + SUBGROUP_GET_BIAS(1, i));
+ }
+ if ( TILE_N_LAST_DIV8 > 2 )
+ {
+#ifdef FUSED_CONV_CHANNEL_RELU
+ negative_slope = intel_sub_group_shuffle(slope[2], i);
+#endif
+ ACTIVATION_FUNCTION(dst, out1_offset + (16+i) * out_pitch_y, blockC1[2][i] + SUBGROUP_GET_BIAS(2, i));
+ }
+ if ( TILE_N_LAST_DIV8 > 3 )
+ {
+#ifdef FUSED_CONV_CHANNEL_RELU
+ negative_slope = intel_sub_group_shuffle(slope[3], i);
+#endif
+ ACTIVATION_FUNCTION(dst, out1_offset + (24+i) * out_pitch_y, blockC1[3][i] + SUBGROUP_GET_BIAS(3, i));
+ }
+ }
+ }
+ }
+#endif
+}
+#endif
+
+#if defined(GEMM_LIKE_CONV_32_2_SIMD16) || defined(GEMM_LIKE_CONV_32_1_SIMD16)
+#ifdef FUSED_CONV_CHANNEL_RELU
+#define INTERLEAVED_SIMD16_OUTPUT(_out_, _offset_, _m_) do {\
+ if (global_y * TILE_M < output_width * output_height ) \
+ { \
+ if ( ( OUT_DEPTH % TILE_N ) == 0 ) {\
+ for (int i = 0; i < 16; i++) \
+ { \
+ negative_slope = intel_sub_group_shuffle(slope[0], i); \
+ ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i)); \
+ negative_slope = intel_sub_group_shuffle(slope[1], i); \
+ ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_ [i] + SUBGROUP_GET_BIAS(1, i)); \
+ } \
+ } \
+ else if( ( OUT_DEPTH % 16 ) == 0 ) { \
+ if ( ( global_x + 1 ) < get_global_size(0) ) { \
+ for ( int i = 0; i < 16; i++ ) \
+ { \
+ negative_slope = intel_sub_group_shuffle(slope[0], i); \
+ ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i)); \
+ negative_slope = intel_sub_group_shuffle(slope[1], i); \
+ ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_ [i] + SUBGROUP_GET_BIAS(1, i)); \
+ } \
+ } \
+ else { \
+ for (int i = 0; i < 16; i++) \
+ { \
+ negative_slope = intel_sub_group_shuffle(slope[0], i); \
+ ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i)); \
+ } \
+ } \
+ } \
+ else { \
+ if ( ( global_x + 1 ) < get_global_size(0) ) \
+ { \
+ for ( int i = 0; i < 16; i++ ) \
+ { \
+ negative_slope = intel_sub_group_shuffle(slope[0], i); \
+ ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i)); \
+ negative_slope = intel_sub_group_shuffle(slope[1], i); \
+ ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_[i] + SUBGROUP_GET_BIAS(1, i)); \
+ } \
+ } \
+ else { \
+ if ( (OUT_DEPTH % TILE_N) > 16 ) { \
+ for (int i = 0; i < 16 ; i++) \
+ { \
+ negative_slope = intel_sub_group_shuffle(slope[0], i); \
+ ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i)); \
+ } \
+ for (int i = 0; i < OUT_DEPTH % 16 ; i++) \
+ { \
+ negative_slope = intel_sub_group_shuffle(slope[1], i); \
+ ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_[i] + SUBGROUP_GET_BIAS(1, i)); \
+ } \
+ } \
+ else { \
+ for (int i = 0; i < OUT_DEPTH % 16 ; i++) \
+ { \
+ negative_slope = intel_sub_group_shuffle(slope[0], i); \
+ ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i)); \
+ } \
+ } \
+ } \
+ } \
+ } \
+ }while(0)
+#else
+#define INTERLEAVED_SIMD16_OUTPUT(_out_, _offset_, _m_) do {\
+ if (global_y * TILE_M < output_width * output_height ) \
+ { \
+ if ( ( OUT_DEPTH % TILE_N ) == 0 ) {\
+ for (int i = 0; i < 16; i++) \
+ { \
+ ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i)); \
+ ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_ [i] + SUBGROUP_GET_BIAS(1, i)); \
+ } \
+ } \
+ else if( ( OUT_DEPTH % 16 ) == 0 ) { \
+ if ( ( global_x + 1 ) < get_global_size(0) ) { \
+ for ( int i = 0; i < 16; i++ ) \
+ { \
+ ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i)); \
+ ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_ [i] + SUBGROUP_GET_BIAS(1, i)); \
+ } \
+ } \
+ else { \
+ for (int i = 0; i < 16; i++) \
+ { \
+ ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_ [i] + SUBGROUP_GET_BIAS(0, i)); \
+ } \
+ } \
+ } \
+ else { \
+ if ( ( global_x + 1 ) < get_global_size(0) ) \
+ { \
+ for ( int i = 0; i < 16; i++ ) \
+ { \
+ ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i)); \
+ ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_[i] + SUBGROUP_GET_BIAS(1, i)); \
+ } \
+ } \
+ else { \
+ if ( (OUT_DEPTH % TILE_N) > 16 ) { \
+ for (int i = 0; i < 16 ; i++) \
+ { \
+ ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i)); \
+ } \
+ for (int i = 0; i < OUT_DEPTH % 16 ; i++) \
+ { \
+ ACTIVATION_FUNCTION(_out_, _offset_ + (16+i) * out_pitch_y, blockC1 ##_m_[i] + SUBGROUP_GET_BIAS(1, i)); \
+ } \
+ } \
+ else { \
+ for (int i = 0; i < OUT_DEPTH % 16 ; i++) \
+ { \
+ ACTIVATION_FUNCTION(_out_, _offset_ + ( 0+i) * out_pitch_y, blockC0 ##_m_[i] + SUBGROUP_GET_BIAS(0, i)); \
+ } \
+ } \
+ } \
+ } \
+ } \
+ }while(0)
+#endif
+#endif
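+// Summary of INTERLEAVED_SIMD16_OUTPUT: when OUT_DEPTH is a multiple of TILE_N
+// every group stores all 32 channels; when it is only a multiple of 16 the last
+// group stores 16 channels; otherwise the last group uses OUT_DEPTH % 16 bounded
+// loops for the ragged tail. The FUSED_CONV_CHANNEL_RELU variant additionally
+// shuffles the per-channel negative slope to each lane before every store.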
+
+#ifdef GEMM_LIKE_CONV_32_1_SIMD16
+#define TILE_M 1
+#define TILE_K KERNEL_WIDTH
+#define TILE_N 32
+
+#ifndef __BEIGNET__
+__attribute__((intel_reqd_sub_group_size(16)))
+#endif
+__kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
+{
+ const int group_x = get_group_id(0);
+ const int group_y = get_group_id(1);
+ const int global_x = get_global_id(0);
+ const int global_y = get_global_id(1);
+ const int global_z = get_global_id(2);
+ int interleaved_y;
+ int kernel_y;
+ int kernel_idx;
+
+ // Result ctile (*dst) is M rows x N columns
+ // LWG size is 1x16. Thus each thread calculates 16*M rows x N cols of ctile.
+ Dtype16 blockC00 = 0.f;
+ Dtype16 blockC10 = 0.f;
+
+ // Src0 (patch input) is directly used as atile.
+ // Each work item points to the start of a different patch.
+ // atile is M rows x K columns.
+ int curr_x = ( global_y % output_width ) * STRIDE_X;
+ int curr_y = ( global_y / output_width ) * STRIDE_Y;
+#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
+ int saved_y = curr_y;
+#endif
+ const __global Dtype *src0_read = src0
+ + aligned_input_size * global_z // batch offset
+ + (curr_y - INPUT_PAD_H) * ROW_PITCH // y offset
+ + curr_x - INPUT_PAD_W; // x offset
+ const __global Dtype *src0_read_orig = src0_read;
+
+ // Src1 (filter) is directly used as btile.
+ // It starts at the top of src1 and walks down.
+ // btile is K rows x N columns.
+ const __global Dtype *src1_read = src1 + ( global_x * TILE_N * 2 );
+
+#define DOT_PRODUCT_16( _result, _rowA, colB ) \
+ { \
+ _result.s0 = mad( _rowA, sub_group_broadcast( colB, 0 ), _result.s0 ); \
+ _result.s1 = mad( _rowA, sub_group_broadcast( colB, 1 ), _result.s1 ); \
+ _result.s2 = mad( _rowA, sub_group_broadcast( colB, 2 ), _result.s2 ); \
+ _result.s3 = mad( _rowA, sub_group_broadcast( colB, 3 ), _result.s3 ); \
+ _result.s4 = mad( _rowA, sub_group_broadcast( colB, 4 ), _result.s4 ); \
+ _result.s5 = mad( _rowA, sub_group_broadcast( colB, 5 ), _result.s5 ); \
+ _result.s6 = mad( _rowA, sub_group_broadcast( colB, 6 ), _result.s6 ); \
+ _result.s7 = mad( _rowA, sub_group_broadcast( colB, 7 ), _result.s7 ); \
+ _result.s8 = mad( _rowA, sub_group_broadcast( colB, 8 ), _result.s8 ); \
+ _result.s9 = mad( _rowA, sub_group_broadcast( colB, 9 ), _result.s9 ); \
+ _result.sa = mad( _rowA, sub_group_broadcast( colB, 10 ), _result.sa ); \
+ _result.sb = mad( _rowA, sub_group_broadcast( colB, 11 ), _result.sb ); \
+ _result.sc = mad( _rowA, sub_group_broadcast( colB, 12 ), _result.sc ); \
+ _result.sd = mad( _rowA, sub_group_broadcast( colB, 13 ), _result.sd ); \
+ _result.se = mad( _rowA, sub_group_broadcast( colB, 14 ), _result.se ); \
+ _result.sf = mad( _rowA, sub_group_broadcast( colB, 15 ), _result.sf ); \
+ }
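+ // As with DOT_PRODUCT_8 in the SIMD8 kernels, one call is a rank-1 update, here
+ // across a 16-wide subgroup: _result.sr accumulates _rowA times the colB value
+ // broadcast from lane r, r = 0..15.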
+ typedef CAT( Dtype, KERNEL_WIDTH ) Dtype_t;
+ // Walk DOWN src0 (patch 0, 1, 2, ...) and DOWN src1.
+ // Inner loop loads and FMADs one row (KERNEL_WIDTH) of each input patch
+ // and KERNEL_WIDTH/2 rows of interleaved filter.
+ int patch_depth = 0;
+#ifndef __BEIGNET__
+ __attribute__((opencl_unroll_hint(1)))
+#endif
+ do
+ {
+ int patch_row = 0;
+#if INPUT_PAD_H != 0 || INPUT_PAD_W != 0 || DILATION_X != 1 || DILATION_Y != 1
+ curr_y = saved_y;
+#endif
+#ifndef __BEIGNET__
+ __attribute__((opencl_unroll_hint(1)))
+#endif
+ do
+ {
+ // Load atile and btile.
+ // Kernel data is partially interleaved. Every 2 rows are interleaved at Dtype16 granularity.
+ // The exception is that if KERNEL_WIDTH is odd, the last row is not interleaved. The
+ // non-interleaved row is padded with zeros to match the size of the interleaved rows.
+ // This interleaving is done to avoid GRF bank conflicts. For example, this is how the
+ // kernel data would be arranged before/after interleaving for KERNEL_WIDTH=3.
+ // (0, 0) (16, 0) (32, 0) (48, 0) ... (0, 0) ( 0, 1) (16, 0) ( 0, 1) (32, 0) (0, 1) (48, 0) ...
+ // (0, 1) (16, 1) (32, 1) (48, 1) ... => (0, 2) (16, 2) (32, 2) (48, 2) ...
+ // (0, 2) (16, 2) (32, 2) (48, 2) ... ...
+ // ...
+ const bool kernel_width_is_odd = KERNEL_WIDTH % 2 == 1;
+
+#if INPUT_PAD_W == 0 && INPUT_PAD_H == 0 && DILATION_X == 1 && DILATION_Y == 1
+ Dtype_t blockA00 = ( (const __global Dtype_t*)src0_read )[ 0 ];
+ Dtype* pblockA00 = (Dtype*)(&blockA00);
+#else
+ Dtype_t blockA00;
+ Dtype* pblockA00 = (Dtype*)(&blockA00);
+ int pos = 0;
+ LOOP(KERNEL_WIDTH, pos,
+ {
+ if (curr_y >= INPUT_PAD_H && curr_y < input_height + INPUT_PAD_H && curr_x + pos * DILATION_X >= INPUT_PAD_W && curr_x + pos * DILATION_X < input_width + INPUT_PAD_W)
+ pblockA00[pos] = src0_read[pos * DILATION_X];
+ else
+ pblockA00[pos] = 0;
+ })
+ curr_y += DILATION_Y;
+#endif
+ src0_read += ROW_PITCH * DILATION_Y;
+ INT_TYPE blockB00[KERNEL_WIDTH * 2];
+ INT_TYPE4* p4BlockB00 = (INT_TYPE4*)blockB00;
+ INT_TYPE2* p2BlockB00 = (INT_TYPE2*)blockB00;
+ Dtype* pBlockB00 = (Dtype*)blockB00;
+ interleaved_y = 0;
+ LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
+ {
+ p4BlockB00[interleaved_y] = SUB_GROUP_BLOCK_READ4( (const __global INT_TYPE*)src1_read );
+ src1_read += WIDTH1 * 2;
+ } )
+ if ( kernel_width_is_odd )
+ {
+ p2BlockB00[KERNEL_WIDTH - 1] = SUB_GROUP_BLOCK_READ2( (const __global INT_TYPE*)src1_read );
+ src1_read += WIDTH1 * 2;
+ }
+
+ // Perform MADs
+ kernel_idx = 0;
+ interleaved_y = 0;
+ LOOP(KERNEL_WIDTH_DIV2, interleaved_y,
+ {
+ kernel_y = interleaved_y * 2;
+ DOT_PRODUCT_16( blockC00, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_16( blockC00, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_16( blockC10, pblockA00[kernel_y ], pBlockB00[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_16( blockC10, pblockA00[kernel_y + 1], pBlockB00[kernel_idx] ); kernel_idx++;
+ } )
+ if ( kernel_width_is_odd )
+ {
+ kernel_y = interleaved_y * 2;
+ DOT_PRODUCT_16( blockC00, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
+ DOT_PRODUCT_16( blockC10, pblockA00[kernel_y], pBlockB00[kernel_idx] ); kernel_idx++;
+ }
+ }
+
+ //while( ++patch_row < 1 ); //debug
+ while( ++patch_row < KERNEL_HEIGHT );
+
+ src0_read += slice_pitch - ( KERNEL_HEIGHT * ROW_PITCH * DILATION_Y ); // reset to start of next slice of patch
+ }
+ //while ( ++patch_depth < 1 ); //debug
+ while ( ++patch_depth < INPUT_DEPTH );
+
+ // Dst resembles a cube of width x height x (output channel * batches). Each tile writes:
+ // (SIMD * TILE_M) x 1 x TILE_N. Partial writes most likely generated if padding used.
+ int out_offset = global_z * out_pitch_z // batch offset
+ + ( group_x * TILE_N ) * out_pitch_y // channel offset
+ + ( ( global_y * TILE_M ) / output_width + OUT_PADDING_HEIGHT) * OUT_PITCH_X // y offset
+ + ( ( global_y * TILE_M ) % output_width ) + OUT_PADDING_LEFT; // x offset
+ __global Dtype *out = dst + out_offset;
+
+#if APPLY_BIAS
+ Dtype bias[2];
+ Dtype2 *bias_vec;
+ bias_vec = (Dtype2*)bias;
+ *bias_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)biases_base + group_x * TILE_N));
+#endif
+#ifdef FUSED_CONV_CHANNEL_RELU
+ Dtype slope[2];
+ Dtype2 *slope_vec;
+ slope_vec = (Dtype2*)slope;
+ *slope_vec = as_Dtype2(SUB_GROUP_BLOCK_READ2((__global INT_TYPE *)negative_slope_base + group_x * TILE_N));
+ Dtype negative_slope;
+#endif
+
+ INTERLEAVED_SIMD16_OUTPUT(dst, out_offset, 0);
+}
+#endif
+#endif // KERNEL_BASIC/IDLF/GEMM_LIKE
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#define CONCAT(A,B) A##_##B
+#define TEMPLATE(name,type) CONCAT(name,type)
+#define Dtype float
+
+__kernel void TEMPLATE(copyWeightsSwizzled, Dtype)
+ (__global Dtype* weightIn,
+ __global Dtype* weightOut,
+ const int kernel_w,
+ const int kernel_h,
+ const int channels,
+ const int outputs,
+ const int swizzleFactor) {
+
+ unsigned int sX = get_global_id(0);
+
+ // Decompose the flat global id into the original (filter, channel, y, x)
+ // location, then compute the swizzled output location.
+ int outputSublayer = channels / swizzleFactor;
+ int outputSublayerIndex = channels % swizzleFactor;
+
+ int filter = sX / (kernel_w*kernel_h*channels);
+ int kernel_X = sX % kernel_w;
+ int kernel_Y = (sX / kernel_w) % kernel_h;
+ int kernel_C = (sX / (kernel_w * kernel_h)) % channels;
+
+ int FP = filter / swizzleFactor;
+ int F1 = filter % swizzleFactor;
+
+ weightOut[FP*(kernel_w*kernel_h*channels*swizzleFactor) + kernel_C*(kernel_w*kernel_h*swizzleFactor) + kernel_Y*(kernel_w*swizzleFactor) + kernel_X*swizzleFactor + F1]
+ = weightIn[filter*(kernel_w*kernel_h*channels) + kernel_C*(kernel_w*kernel_h) + kernel_Y*kernel_w + kernel_X];
+}
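+
+// Resulting layout, read off the index expression above:
+// weightOut[filter/swizzleFactor][channel][kernel_y][kernel_x][filter%swizzleFactor],
+// i.e. groups of swizzleFactor consecutive filters become the innermost dimension
+// (presumably so that a subgroup block read fetches one weight per filter per lane).
+// Worked example with hypothetical sizes kernel_w = kernel_h = 1, channels = 4,
+// swizzleFactor = 8: filter 10, channel 2 gives FP = 1, F1 = 2, so
+// weightOut[1*(4*8) + 2*8 + 2] = weightOut[50] = weightIn[10*4 + 2] = weightIn[42].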
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+__kernel void dummy_kernel()
+{
+}
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#define CONCAT(A,B) A##_##B
+#define TEMPLATE(name,type) CONCAT(name,type)
+
+// Types used for parameters, offset computations and so on
+#define int_tp int
+#define uint_tp unsigned int
+
+#define Dtype float
+#define Dtype2 float2
+#define Dtype4 float4
+#define Dtype8 float8
+
+#define as_Dtype as_float
+#define as_Dtype2 as_float2
+#define as_Dtype4 as_float4
+#define as_Dtype8 as_float8
+
+#define KERNEL_ARG_DTYPE float
+
+#if defined(cl_intel_subgroups)
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#endif
+
+#define TILE_M 32
+#define TILE_K 8
+
+// Common block to compute (alpha * A*B + beta * C) and write the result to the
+// destination (image or buffer).
+
+#define SUBGROUP_BLOCK_READ8( __image, __coord ) intel_sub_group_block_read8( __image, __coord )
+#define SHUFFLE_TYPE2(val) val
+#define SHUFFLE_TYPE8(val) val
+#define READ_IMAGE(__image, __coord) read_imagef(__image, sampler, __coord)
+#define SIZE_OF_ELEMENT sizeof(uint)
+#define SIMD_SIZE_GEMM 8
+#define TILE_N 8
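+
+// With TILE_M = 32, TILE_N = 8 and an 8-wide subgroup, each subgroup accumulates a
+// 32x8 tile of C as four Dtype8 row blocks (blockAxB00..blockAxB03 in GEMM_NN below),
+// advancing K by TILE_K = 8 per loop iteration.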
+
+//#define USE_IMAGE_C
+#ifdef USE_IMAGE_C
+#define BLOCKC_READ8( _C, _coordC ) as_Dtype8( intel_sub_group_block_read8( _C, _coordC ) )
+#define BLOCKC_WRITE8( _C, _coordC, _val ) intel_sub_group_block_write8( _C, _coordC, as_uint8( _val ) )
+#define MATC_PARAMETER __read_only image2d_t C, __write_only image2d_t dst
+#define GEMM_OUTPUT(ALPHA1, BETA_NOT0) GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, C, dst, sizeof(uint))
+#else
+#define BLOCKC_READ8( _C, _coordC ) \
+ (Dtype8) ( (_coordC.x + get_local_id(0) < N && _coordC.y < M) ? _C[ _coordC.y * ldc + _coordC.x + get_local_id(0) ] : 0, \
+ (_coordC.x + get_local_id(0) < N && _coordC.y + 1 < M) ? _C[ ( _coordC.y + 1 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \
+ (_coordC.x + get_local_id(0) < N && _coordC.y + 2 < M) ? _C[ ( _coordC.y + 2 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \
+ (_coordC.x + get_local_id(0) < N && _coordC.y + 3 < M) ? _C[ ( _coordC.y + 3 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \
+ (_coordC.x + get_local_id(0) < N && _coordC.y + 4 < M) ? _C[ ( _coordC.y + 4 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \
+ (_coordC.x + get_local_id(0) < N && _coordC.y + 5 < M) ? _C[ ( _coordC.y + 5 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \
+ (_coordC.x + get_local_id(0) < N && _coordC.y + 6 < M) ? _C[ ( _coordC.y + 6 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \
+ (_coordC.x + get_local_id(0) < N && _coordC.y + 7 < M) ? _C[ ( _coordC.y + 7 ) * ldc + _coordC.x + get_local_id(0) ] : 0)
+
+#define BLOCKC_WRITE8( _C, _coordC, _val) do {\
+ if (_coordC.x + get_local_id(0) < N) { \
+ if (_coordC.y < M) \
+ _C[ _coordC.y * ldc + _coordC.x + get_local_id(0) ] = _val.s0; \
+ if (_coordC.y + 1 < M) \
+ _C[ ( _coordC.y + 1 )* ldc + _coordC.x + get_local_id(0) ] = _val.s1; \
+ if (_coordC.y + 2 < M) \
+ _C[ ( _coordC.y + 2 )* ldc + _coordC.x + get_local_id(0) ] = _val.s2; \
+ if (_coordC.y + 3 < M) \
+ _C[ ( _coordC.y + 3 )* ldc + _coordC.x + get_local_id(0) ] = _val.s3; \
+ if (_coordC.y + 4 < M) \
+ _C[ ( _coordC.y + 4 )* ldc + _coordC.x + get_local_id(0) ] = _val.s4; \
+ if (_coordC.y + 5 < M) \
+ _C[ ( _coordC.y + 5 )* ldc + _coordC.x + get_local_id(0) ] = _val.s5; \
+ if (_coordC.y + 6 < M) \
+ _C[ ( _coordC.y + 6 )* ldc + _coordC.x + get_local_id(0) ] = _val.s6; \
+ if (_coordC.y + 7 < M) \
+ _C[ ( _coordC.y + 7 )* ldc + _coordC.x + get_local_id(0) ] = _val.s7; \
+ }} while(0)
+#define MATC_PARAMETER __global Dtype * C, const int offC, const int M, const int N, const int ldc
+#define GEMM_OUTPUT(ALPHA1, BETA_NOT0) GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, (C + offC), (C + offC), 1)
+#endif
+
+#define GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, _C, _dst, _C_step) \
+ int2 coordDst = (int2)( ( group_x * TILE_N ) * _C_step, ( group_y * TILE_M ) ); \
+ int2 coordC = coordDst; \
+ Dtype8 blockC00; \
+ Dtype8 blockC01; \
+ Dtype8 blockC02; \
+ Dtype8 blockC03; \
+ if (BETA_NOT0) { \
+ blockC00 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
+ blockC01 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
+ blockC02 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
+ blockC03 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); \
+ if (!ALPHA1) { \
+ blockC00 = mad(blockAxB00, (Dtype8)alpha, blockC00); \
+ blockC01 = mad(blockAxB01, (Dtype8)alpha, blockC01); \
+ blockC02 = mad(blockAxB02, (Dtype8)alpha, blockC02); \
+ blockC03 = mad(blockAxB03, (Dtype8)alpha, blockC03); \
+ } else { \
+ blockC00 += blockAxB00; \
+ blockC01 += blockAxB01; \
+ blockC02 += blockAxB02; \
+ blockC03 += blockAxB03; \
+ } \
+ } else { \
+ blockC00 = isFirstColBlock ? (Dtype8)0 : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
+ blockC01 = isFirstColBlock ? (Dtype8)0 : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
+ blockC02 = isFirstColBlock ? (Dtype8)0 : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
+ blockC03 = isFirstColBlock ? (Dtype8)0 : BLOCKC_READ8( _C, coordC ); \
+ if (!ALPHA1) { \
+ blockC00 = mad(blockAxB00, (Dtype8)alpha, blockC00); \
+ blockC01 = mad(blockAxB01, (Dtype8)alpha, blockC01); \
+ blockC02 = mad(blockAxB02, (Dtype8)alpha, blockC02); \
+ blockC03 = mad(blockAxB03, (Dtype8)alpha, blockC03); \
+ } else { \
+ blockC00 += blockAxB00; \
+ blockC01 += blockAxB01; \
+ blockC02 += blockAxB02; \
+ blockC03 += blockAxB03; \
+ } \
+ } \
+ BLOCKC_WRITE8( _dst, coordDst, blockC00 ); coordDst.y += 8; \
+ BLOCKC_WRITE8( _dst, coordDst, blockC01 ); coordDst.y += 8; \
+ BLOCKC_WRITE8( _dst, coordDst, blockC02 ); coordDst.y += 8; \
+ BLOCKC_WRITE8( _dst, coordDst, blockC03 );
+
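+// NOTE (illustrative): GEMM_OUTPUT_EXT completes one TILE_M x TILE_N tile of the update
+//   C = alpha * (A x B) + beta * C
+// in four 8-row blocks. beta (or zeroing, when BETA_NOT0 == 0) is applied only when
+// isFirstColBlock is set, so partial sums from later column blocks accumulate into C
+// without rescaling it again.
+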
+// Get the specified column of the block.
+#define TRANSPOSE_BLOCK_8( _block, _col ) \
+ (Dtype8)( intel_sub_group_shuffle( _block.s0, _col ), \
+ intel_sub_group_shuffle( _block.s1, _col ), \
+ intel_sub_group_shuffle( _block.s2, _col ), \
+ intel_sub_group_shuffle( _block.s3, _col ), \
+ intel_sub_group_shuffle( _block.s4, _col ), \
+ intel_sub_group_shuffle( _block.s5, _col ), \
+ intel_sub_group_shuffle( _block.s6, _col ), \
+ intel_sub_group_shuffle( _block.s7, _col ) )
+
+// Multiply A's column block by B's row block.
+#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \
+ { \
+ const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \
+ const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 ); \
+ const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 ); \
+ const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 ); \
+ const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 ); \
+ const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \
+ const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \
+ const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \
+ _result = mad( (Dtype8)(_blockB.s0), acol0, _result ); \
+ _result = mad( (Dtype8)(_blockB.s1), acol1, _result ); \
+ _result = mad( (Dtype8)(_blockB.s2), acol2, _result ); \
+ _result = mad( (Dtype8)(_blockB.s3), acol3, _result ); \
+ _result = mad( (Dtype8)(_blockB.s4), acol4, _result ); \
+ _result = mad( (Dtype8)(_blockB.s5), acol5, _result ); \
+ _result = mad( (Dtype8)(_blockB.s6), acol6, _result ); \
+ _result = mad( (Dtype8)(_blockB.s7), acol7, _result ); \
+ }
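+
+// NOTE (illustrative): a plain-C sketch of what the shuffle-based 8x8 block multiply above
+// computes; each sub-group lane j holds column j of blockB and of the result:
+//   for (int i = 0; i < 8; ++i)          // row of blockA / result
+//       for (int j = 0; j < 8; ++j)      // column of blockB / result (one lane each)
+//           for (int k = 0; k < 8; ++k)
+//               result[i][j] += blockA[i][k] * blockB[k][j];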
+
+#define GEMM_NN(ALPHA1, BETA_NOT0) \
+__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
+__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
+__kernel void TEMPLATE(gemm_32_1_NN_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \
+ __read_only image2d_t A, \
+ __read_only image2d_t B, \
+ MATC_PARAMETER, \
+ KERNEL_ARG_DTYPE alpha_in, \
+ KERNEL_ARG_DTYPE beta_in, \
+ int width0, \
+ int isFirstColBlock) \
+{ \
+ const Dtype alpha = (Dtype)alpha_in; \
+ const Dtype beta = (Dtype)beta_in; \
+ const int group_x = get_group_id(0); \
+ const int group_y = get_group_id(1); \
+ Dtype8 blockAxB00 = 0.0f; \
+ Dtype8 blockAxB01 = 0.0f; \
+ Dtype8 blockAxB02 = 0.0f; \
+ Dtype8 blockAxB03 = 0.0f; \
+ int2 coordA = (int2)( 0, group_y * TILE_M ); \
+ int2 coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 ); \
+ do \
+ { \
+ int2 coordBTemp = coordB; \
+ Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K; \
+ int2 coordATemp = coordA; \
+ Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
+ Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
+ Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
+ Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.x += TILE_K * SIZE_OF_ELEMENT; \
+ MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00 ); \
+ MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00 ); \
+ MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00 ); \
+ MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00 ); \
+ } \
+ while( coordB.y < width0 ); \
+ GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
+}
+
+GEMM_NN(1, 0) // ALPHA == 1, BETA == 0
+GEMM_NN(1, 1) // ALPHA == 1, BETA != 0
+GEMM_NN(0, 0) // ALPHA != 1, BETA == 0
+GEMM_NN(0, 1) // ALPHA != 1, BETA != 0
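+
+// NOTE (illustrative): via TEMPLATE/CONCAT these four instantiations expand (with Dtype
+// defined as float) to kernels named gemm_32_1_NN_1_0_float, gemm_32_1_NN_1_1_float,
+// gemm_32_1_NN_0_0_float and gemm_32_1_NN_0_1_float, so the host can select the
+// specialization that matches its alpha/beta values at enqueue time.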
+
+#undef TRANSPOSE_BLOCK_8
+#undef MULTIPLY_BLOCKS_8x8
+#undef GEMM_NN
+
+// Gather one row, spread across the sub-group lanes, into a column block.
+#define TRANSPOSE_BLOCK_8(_vec, _col) \
+ (Dtype8)( intel_sub_group_shuffle(_vec, _col + 0), \
+ intel_sub_group_shuffle(_vec, _col + 1), \
+ intel_sub_group_shuffle(_vec, _col + 2), \
+ intel_sub_group_shuffle(_vec, _col + 3), \
+ intel_sub_group_shuffle(_vec, _col + 4), \
+ intel_sub_group_shuffle(_vec, _col + 5), \
+ intel_sub_group_shuffle(_vec, _col + 6), \
+ intel_sub_group_shuffle(_vec, _col + 7) )
+
+#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB, _col ) \
+ { \
+ _result = mad( (Dtype8)(_blockB.s0), TRANSPOSE_BLOCK_8(_blockA.s0, _col), _result ); \
+ _result = mad( (Dtype8)(_blockB.s1), TRANSPOSE_BLOCK_8(_blockA.s1, _col), _result ); \
+ _result = mad( (Dtype8)(_blockB.s2), TRANSPOSE_BLOCK_8(_blockA.s2, _col), _result ); \
+ _result = mad( (Dtype8)(_blockB.s3), TRANSPOSE_BLOCK_8(_blockA.s3, _col), _result ); \
+ _result = mad( (Dtype8)(_blockB.s4), TRANSPOSE_BLOCK_8(_blockA.s4, _col), _result ); \
+ _result = mad( (Dtype8)(_blockB.s5), TRANSPOSE_BLOCK_8(_blockA.s5, _col), _result ); \
+ _result = mad( (Dtype8)(_blockB.s6), TRANSPOSE_BLOCK_8(_blockA.s6, _col), _result ); \
+ _result = mad( (Dtype8)(_blockB.s7), TRANSPOSE_BLOCK_8(_blockA.s7, _col), _result ); \
+ }
+
+#define GEMM_TN(ALPHA1, BETA_NOT0) \
+__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
+__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
+__kernel void TEMPLATE(gemm_32_1_TN_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \
+ __read_only image2d_t A, \
+ __read_only image2d_t B, \
+ MATC_PARAMETER, \
+ KERNEL_ARG_DTYPE alpha_in, \
+ KERNEL_ARG_DTYPE beta_in, \
+ int width0, \
+ int isFirstColBlock) \
+{ \
+ const Dtype alpha = (Dtype)alpha_in; \
+ const Dtype beta = (Dtype)beta_in; \
+ const int group_x = get_group_id(0);\
+ const int group_y = get_group_id(1);\
+ Dtype8 blockAxB00 = 0.0f;\
+ Dtype8 blockAxB01 = 0.0f;\
+ Dtype8 blockAxB02 = 0.0f;\
+ Dtype8 blockAxB03 = 0.0f;\
+ int2 coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 );\
+ int2 coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 );\
+ do\
+ {\
+ int2 coordBTemp = coordB;\
+ Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K;\
+ int2 coordATemp = coordA;\
+ Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT;\
+ Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT;\
+ Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT;\
+ Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.y += TILE_K;\
+ MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, 0 ); \
+ MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00, 0 ); \
+ MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00, 0 ); \
+ MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00, 0 ); \
+ } \
+ while( coordB.y < width0 ); \
+ GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
+}
+
+GEMM_TN(1, 0) // ALPHA == 1, BETA == 0
+GEMM_TN(1, 1) // ALPHA == 1, BETA != 0
+GEMM_TN(0, 0) // ALPHA != 1, BETA == 0
+GEMM_TN(0, 1) // ALPHA != 1, BETA != 0
+
+#undef MULTIPLY_BLOCKS_8x8
+#undef TRANSPOSE_BLOCK_8
+#undef GEMM_TN
+
+// The same as GEMM_NN.
+#define TRANSPOSE_BLOCK_8( _block, _col ) \
+ (Dtype8)( intel_sub_group_shuffle( _block.s0, _col), \
+ intel_sub_group_shuffle( _block.s1, _col), \
+ intel_sub_group_shuffle( _block.s2, _col), \
+ intel_sub_group_shuffle( _block.s3, _col), \
+ intel_sub_group_shuffle( _block.s4, _col), \
+ intel_sub_group_shuffle( _block.s5, _col), \
+ intel_sub_group_shuffle( _block.s6, _col), \
+ intel_sub_group_shuffle( _block.s7, _col) )
+
+#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \
+ { \
+ const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \
+ const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 ); \
+ const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 ); \
+ const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 ); \
+ const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 ); \
+ const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \
+ const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \
+ const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \
+ _result = mad( (Dtype8)_blockB.s0, acol0, _result ); \
+ _result = mad( (Dtype8)_blockB.s1, acol1, _result ); \
+ _result = mad( (Dtype8)_blockB.s2, acol2, _result ); \
+ _result = mad( (Dtype8)_blockB.s3, acol3, _result ); \
+ _result = mad( (Dtype8)_blockB.s4, acol4, _result ); \
+ _result = mad( (Dtype8)_blockB.s5, acol5, _result ); \
+ _result = mad( (Dtype8)_blockB.s6, acol6, _result ); \
+ _result = mad( (Dtype8)_blockB.s7, acol7, _result ); \
+ }
+
+#define GEMM_NT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
+__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
+__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
+__kernel void TEMPLATE(gemm_32_1_NT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \
+ __read_only image2d_t A, \
+ MATB_PARAMETER, \
+ MATC_PARAMETER, \
+ KERNEL_ARG_DTYPE alpha_in, \
+ KERNEL_ARG_DTYPE beta_in, \
+ int padded_k, \
+ int k, \
+ int isFirstColBlock) \
+{ \
+ const Dtype alpha = (Dtype)alpha_in; \
+ const Dtype beta = (Dtype)beta_in; \
+ const int group_x = get_group_id(0); \
+ const int group_y = get_group_id(1); \
+ Dtype8 blockAxB00 = 0.0f; \
+ Dtype8 blockAxB01 = 0.0f; \
+ Dtype8 blockAxB02 = 0.0f; \
+ Dtype8 blockAxB03 = 0.0f; \
+ int2 coordA = (int2)( 0, group_y * TILE_M ); \
+ int2 coordB = (int2)( 0, ( group_x * TILE_N )); \
+ const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \
+ do \
+ { \
+ Dtype8 blockB00; \
+ BLOCKB_READ8(blockB00, B, coordB); \
+ int2 coordATemp = coordA; \
+ Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
+ Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
+ Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
+ Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.x += TILE_K * SIZE_OF_ELEMENT; \
+ MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00 ); \
+ MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00 ); \
+ MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00 ); \
+ MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00 ); \
+ } \
+ while( coordB.x < padded_k / VECSIZE ); \
+ GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
+}
+
+#define BLOCKB_READ8(_blockb, _B, _coordB) \
+ int2 _coordBTemp = _coordB; \
+ _coordBTemp.y += get_local_id(0); \
+ _blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+ _blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordB.x += 2;
+
+#define MATB_PARAMETER __read_only image2d_t B
+
+GEMM_NT(1, 0, VEC4, 4) // ALPHA == 1, BETA == 0
+GEMM_NT(1, 1, VEC4, 4) // ALPHA == 1, BETA != 0
+GEMM_NT(0, 0, VEC4, 4) // ALPHA != 1, BETA == 0
+GEMM_NT(0, 1, VEC4, 4) // ALPHA != 1, BETA != 0
+#undef BLOCKB_READ8
+#undef MATB_PARAMETER
+
+#define BLOCKB_READ8(_blockb, _B, _coordB) \
+ int2 _coordBTemp = _coordB; \
+ _coordBTemp.y += get_local_id(0); \
+ const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * ldb) + _coordBTemp.x + offB); \
+ _blockb = vload8(0, B_read); \
+ _coordB.x += TILE_K;
+
+#define MATB_PARAMETER __global Dtype *B, int offB, int ldb
+
+GEMM_NT(1, 0, BUFFER, 1) // ALPHA == 1, BETA == 0
+GEMM_NT(1, 1, BUFFER, 1) // ALPHA == 1, BETA != 0
+GEMM_NT(0, 0, BUFFER, 1) // ALPHA != 1, BETA == 0
+GEMM_NT(0, 1, BUFFER, 1) // ALPHA != 1, BETA != 0
+#undef BLOCKB_READ8
+#undef MATB_PARAMETER
+
+#define BLOCKB_READ8(_blockb, _B, _coordB) \
+ int2 _coordBTemp = _coordB; \
+ _coordBTemp.y += get_local_id(0); \
+ Dtype4 temp; \
+ temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+ _blockb.s0 = temp.s0; \
+ temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+ _blockb.s1 = temp.s0; \
+ temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+ _blockb.s2 = temp.s0; \
+ temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+ _blockb.s3 = temp.s0; \
+ temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+ _blockb.s4 = temp.s0; \
+ temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+ _blockb.s5 = temp.s0; \
+ temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+ _blockb.s6 = temp.s0; \
+ temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+ _blockb.s7 = temp.s0; \
+ _coordB.x += 8;
+
+#define MATB_PARAMETER __read_only image2d_t B
+
+GEMM_NT(1, 0, SCALAR, 1) // ALPHA == 1, BETA == 0
+GEMM_NT(1, 1, SCALAR, 1) // ALPHA == 1, BETA != 0
+GEMM_NT(0, 0, SCALAR, 1) // ALPHA != 1, BETA == 0
+GEMM_NT(0, 1, SCALAR, 1) // ALPHA != 1, BETA != 0
+#undef BLOCKB_READ8
+#undef MATB_PARAMETER
+
+#undef MULTIPLY_BLOCKS_8x8
+#undef TRANSPOSE_BLOCK_8
+#undef GEMM_NT
+
+// The same as GEMM_TN.
+#define TRANSPOSE_BLOCK_8(_vec, _col) \
+ (Dtype8)( intel_sub_group_shuffle(_vec, _col + 0), \
+ intel_sub_group_shuffle(_vec, _col + 1), \
+ intel_sub_group_shuffle(_vec, _col + 2), \
+ intel_sub_group_shuffle(_vec, _col + 3), \
+ intel_sub_group_shuffle(_vec, _col + 4), \
+ intel_sub_group_shuffle(_vec, _col + 5), \
+ intel_sub_group_shuffle(_vec, _col + 6), \
+ intel_sub_group_shuffle(_vec, _col + 7) )
+
+#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB, _col ) \
+ { \
+ const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA.s0, _col ); \
+ const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA.s1, _col ); \
+ const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA.s2, _col ); \
+ const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA.s3, _col ); \
+ const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA.s4, _col ); \
+ const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA.s5, _col ); \
+ const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA.s6, _col ); \
+ const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA.s7, _col ); \
+ _result = mad( (Dtype8)_blockB.s0, acol0, _result ); \
+ _result = mad( (Dtype8)_blockB.s1, acol1, _result ); \
+ _result = mad( (Dtype8)_blockB.s2, acol2, _result ); \
+ _result = mad( (Dtype8)_blockB.s3, acol3, _result ); \
+ _result = mad( (Dtype8)_blockB.s4, acol4, _result ); \
+ _result = mad( (Dtype8)_blockB.s5, acol5, _result ); \
+ _result = mad( (Dtype8)_blockB.s6, acol6, _result ); \
+ _result = mad( (Dtype8)_blockB.s7, acol7, _result ); \
+ }
+
+#define GEMM_TT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
+__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
+__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
+__kernel void TEMPLATE(gemm_32_1_TT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \
+ __read_only image2d_t A, \
+ MATB_PARAMETER, \
+ MATC_PARAMETER, \
+ KERNEL_ARG_DTYPE alpha_in, \
+ KERNEL_ARG_DTYPE beta_in, \
+ int padded_k, \
+ int k, \
+ int isFirstColBlock) \
+{ \
+ const Dtype alpha = (Dtype)alpha_in; \
+ const Dtype beta = (Dtype)beta_in; \
+ const int group_x = get_group_id(0); \
+ const int group_y = get_group_id(1); \
+ Dtype8 blockAxB00 = 0.0f; \
+ Dtype8 blockAxB01 = 0.0f; \
+ Dtype8 blockAxB02 = 0.0f; \
+ Dtype8 blockAxB03 = 0.0f; \
+ int2 coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 ); \
+ int2 coordB = (int2)( 0, ( group_x * TILE_N )); \
+ const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \
+ do \
+ { \
+ Dtype8 blockB00; \
+ BLOCKB_READ8(blockB00, B, coordB); \
+ int2 coordATemp = coordA; \
+ Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT; \
+ Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT; \
+ Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT; \
+ Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.y += TILE_K; \
+ MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00 , blockB00, 0 ); \
+ MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01 , blockB00, 0 ); \
+ MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02 , blockB00, 0 ); \
+ MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03 , blockB00, 0 ); \
+ } \
+ while( coordB.x < padded_k / VECSIZE ); \
+ GEMM_OUTPUT(ALPHA1, BETA_NOT0);\
+}
+
+#define BLOCKB_READ8(_blockb, _B, _coordB) \
+ int2 _coordBTemp = _coordB; \
+ _coordBTemp.y += get_local_id(0); \
+ _blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+ _blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordB.x += 2;
+
+#define MATB_PARAMETER __read_only image2d_t B
+
+GEMM_TT(1, 0, VEC4, 4) // ALPHA == 1, BETA == 0
+GEMM_TT(1, 1, VEC4, 4) // ALPHA == 1, BETA != 0
+GEMM_TT(0, 0, VEC4, 4) // ALPHA != 1, BETA == 0
+GEMM_TT(0, 1, VEC4, 4) // ALPHA != 1, BETA != 0
+#undef BLOCKB_READ8
+#undef MATB_PARAMETER
+
+#define BLOCKB_READ8(_blockb, _B, _coordB) \
+ int2 _coordBTemp = _coordB; \
+ _coordBTemp.y += get_local_id(0); \
+ const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * k) + _coordBTemp.x + offB); \
+ _blockb = vload8(0, B_read); \
+ _coordB.x += TILE_K;
+
+#define MATB_PARAMETER __global Dtype *B, int offB, int ldb
+
+GEMM_TT(1, 0, BUFFER, 1) // ALPHA == 1, BETA == 0
+GEMM_TT(1, 1, BUFFER, 1) // ALPHA == 1, BETA != 0
+GEMM_TT(0, 0, BUFFER, 1) // ALPHA != 1, BETA == 0
+GEMM_TT(0, 1, BUFFER, 1) // ALPHA != 1, BETA != 0
+#undef BLOCKB_READ8
+#undef MATB_PARAMETER
+
+#define BLOCKB_READ8(_blockb, _B, _coordB) \
+ int2 _coordBTemp = _coordB; \
+ _coordBTemp.y += get_local_id(0); \
+ Dtype4 temp; \
+ temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \
+ _blockb.s0 = temp.s0; \
+ temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \
+ _blockb.s1 = temp.s0; \
+ temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \
+ _blockb.s2 = temp.s0; \
+ temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \
+ _blockb.s3 = temp.s0; \
+ temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \
+ _blockb.s4 = temp.s0; \
+ temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \
+ _blockb.s5 = temp.s0; \
+ temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \
+ _blockb.s6 = temp.s0; \
+ temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \
+ _blockb.s7 = temp.s0; \
+ _coordB.x += 8;
+
+#define MATB_PARAMETER __read_only image2d_t B
+
+GEMM_TT(1, 0, SCALAR, 1) // ALPHA == 1, BETA == 0
+GEMM_TT(1, 1, SCALAR, 1) // ALPHA == 1, BETA != 0
+GEMM_TT(0, 0, SCALAR, 1) // ALPHA != 1, BETA == 0
+GEMM_TT(0, 1, SCALAR, 1) // ALPHA != 1, BETA != 0
+#undef BLOCKB_READ8
+#undef MATB_PARAMETER
+
+#undef MULTIPLY_BLOCKS_8x8
+#undef TRANSPOSE_BLOCK_8
+#undef GEMM_TT
+
+#undef TILE_M
+#undef TILE_K
+#undef TILE_N
+#undef SUBGROUP_BLOCK_READ8
+#undef READ_IMAGE
+#undef SIZE_OF_ELEMENT
+
+__kernel void TEMPLATE(gemm_buffer_copy_image_transpose,Dtype)(
+ __global Dtype* A,
+ __write_only image2d_t ImA,
+ int offA,
+ int width,
+ int height,
+ int ldA)
+{
+ const int gidx = get_global_id(0);
+ const int gidy = get_global_id(1);
+ int2 coord_dst = (int2)(gidx, gidy);
+ __global Dtype* A_off = A + offA;
+ Dtype srcA = A_off[gidy * ldA + gidx];
+ write_imagef(ImA, coord_dst, (Dtype4)srcA);
+}
+
+__kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose,Dtype)(
+ __global Dtype* A,
+ __write_only image2d_t ImA,
+ int offA,
+ int width,
+ int height,
+ int ldA)
+{
+ const int gidx = get_global_id(0);
+ const int gidy = get_global_id(1);
+ int2 coord_dst = (int2)(gidx, gidy);
+ if (gidx >= width || gidy >= height) {
+ write_imageui(ImA, coord_dst, (uint4)0);
+ return;
+ }
+ __global Dtype* A_off = A + offA;
+ uint4 srcA = convert_uint4(as_uchar4(A_off[gidy * ldA + gidx]));
+ write_imageui(ImA, coord_dst, srcA);
+}
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#define CONCAT(A,B) A##_##B
+#define TEMPLATE(name,type) CONCAT(name,type)
+#define Dtype float
+
+__kernel void TEMPLATE(axpy,Dtype)(const int n, const Dtype alpha, __global const Dtype* x,
+ const int offx, __global Dtype* y,
+ const int offy) {
+ for (int index = get_global_id(0); index < n; index += get_global_size(0)) {
+ Dtype src = x[offx + index];
+ Dtype dst = y[offy + index];
+ y[offy + index] = alpha * src + dst;
+ }
+}
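+
+// NOTE (illustrative): the kernel above is the BLAS-style AXPY update; a scalar reference is
+//   for (int i = 0; i < n; ++i)
+//       y[offy + i] = alpha * x[offx + i] + y[offy + i];
+// the grid-stride loop lets any launch size cover all n elements.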
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#define CONCAT(A,B) A##_##B
+#define TEMPLATE(name,type) CONCAT(name,type)
+#define Dtype float
+
+__kernel void TEMPLATE(matvec_mul4,Dtype)(
+ __global const float * A,
+ int offA,
+ unsigned int A_col_size,
+ unsigned int trail_item,
+ __global const float * v,
+ int offv,
+ float alpha,
+ float beta,
+ __global float4 * result,
+ int offr,
+ __local float4 * work)
+{
+ unsigned int row_gid = get_group_id(0);
+ unsigned int lid = get_local_id(0);
+ const __global float *src0_read = A + row_gid * 4 * A_col_size + offA;
+ const __global float *src1_read = v + offv;
+ result = (__global float4*)((__global float*)result + offr);
+ float4 dot0 = (float4)(0.f);
+ float4 dot1 = (float4)(0.f);
+ float4 dot2 = (float4)(0.f);
+ float4 dot3 = (float4)(0.f);
+
+ unsigned int i = lid;
+ while( i < A_col_size / 4) {
+ const float4 a0 = vload4(i, src0_read);
+ const float4 a1 = vload4(i, src0_read + A_col_size);
+ const float4 a2 = vload4(i, src0_read + 2 * A_col_size);
+ const float4 a3 = vload4(i, src0_read + 3 * A_col_size);
+
+ const float4 b0 = vload4(i, src1_read);
+
+ dot0 += a0 * b0;
+ dot1 += a1 * b0;
+ dot2 += a2 * b0;
+ dot3 += a3 * b0;
+
+ i += get_local_size(0);
+ }
+
+ work[lid].s0 = dot0.x + dot0.y + dot0.z + dot0.w;
+ work[lid].s1 = dot1.x + dot1.y + dot1.z + dot1.w;
+ work[lid].s2 = dot2.x + dot2.y + dot2.z + dot2.w;
+ work[lid].s3 = dot3.x + dot3.y + dot3.z + dot3.w;
+
+ if(i == A_col_size / 4)
+ {
+ if(trail_item != 0)
+ {
+ const __global float *src0_trail = src0_read + i * 4;
+ const __global float *src1_trail = src1_read + i * 4;
+ for(unsigned int i = 0; i < trail_item; ++i) {
+ const float at0 = src0_trail[i];
+ const float at1 = src0_trail[i + A_col_size];
+ const float at2 = src0_trail[i + 2 * A_col_size];
+ const float at3 = src0_trail[i + 3 * A_col_size];
+
+ const float bt = src1_trail[i];
+
+ work[lid].s0 += at0 * bt;
+ work[lid].s1 += at1 * bt;
+ work[lid].s2 += at2 * bt;
+ work[lid].s3 += at3 * bt;
+ }
+ }
+
+ }
+
+ for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1) {
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if(lid < stride)
+ work[lid] += work[lid+stride];
+ }
+ if(lid == 0) {
+ if(beta == (Dtype)0)
+ result[row_gid] = alpha * work[0];
+ else
+ result[row_gid] = alpha * work[0] + beta * result[row_gid];
+ }
+}
+
+/* This kernel handles the trailing rows when rows_of_A % 4 != 0 */
+__kernel void TEMPLATE(matvec_mul1,Dtype)(
+ __global const float * A,
+ int offA,
+ unsigned int A_col_size,
+ unsigned int row_offset,
+ unsigned int trail_item,
+ __global const float * v,
+ int offv,
+ float alpha,
+ float beta,
+ __global float * result,
+ int offr,
+ __local float * work)
+{
+ unsigned int row_gid = get_group_id(0);
+ unsigned int lid = get_local_id(0);
+
+ const __global float *src0_read = A + (row_offset + row_gid) * A_col_size + offA;
+ const __global float *src1_read = v + offv;
+ result = result + offr;
+ float4 dot0 = (float4)(0.f);
+
+ unsigned int i = lid;
+ while( i < A_col_size / 4)
+ {
+ const float4 a0 = vload4(i, src0_read);
+ const float4 b0 = vload4(i, src1_read);
+
+ dot0 += a0 * b0;
+ i += get_local_size(0);
+ }
+
+ work[lid] = dot0.x + dot0.y + dot0.z + dot0.w;
+
+ if(i == A_col_size / 4)
+ {
+ if(trail_item != 0)
+ {
+ const __global float *src0_trail = src0_read + i * 4;
+ const __global float *src1_trail = src1_read + i * 4;
+ for(unsigned int i = 0; i < trail_item; ++i) {
+ const float at0 = src0_trail[i];
+ const float bt = src1_trail[i];
+
+ work[lid] += at0 * bt;
+ }
+ }
+
+ }
+ for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1) {
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if(lid < stride)
+ work[lid] += work[lid+stride];
+ }
+
+ if(lid == 0) {
+ if(beta == (Dtype)0) {
+ result[row_gid+row_offset] = alpha * work[0];
+ } else {
+ result[row_gid+row_offset] *= beta;
+ result[row_gid+row_offset] += alpha * work[0];
+ }
+ }
+}
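+
+// NOTE (illustrative): both kernels above compute the GEMV update
+//   result[r] = alpha * dot(A[r, :], v) + beta * result[r];
+// matvec_mul4 assigns four consecutive rows to each work-group, accumulates float4 partial
+// dot products per work-item and combines them with the local-memory tree reduction, while
+// matvec_mul1 covers the up-to-three leftover rows when the row count of A is not a
+// multiple of four.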
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#define CONCAT(A,B) A##_##B
+#define TEMPLATE(name,type) CONCAT(name,type)
+#define Dtype float
+
+__kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global const Dtype* in,
+ const int num, const int channels,
+ const int height, const int width, const int size,
+ const Dtype alpha_over_size, const Dtype k,
+ __global Dtype* const out,
+ const Dtype negative_beta) {
+ for (int index = get_global_id(0); index < nthreads;
+ index += get_global_size(0)) {
+ // find out the local offset
+ const int w = index % width;
+ const int h = (index / width) % height;
+ const int n = index / width / height;
+ const int offset = (n * channels * height + h) * width + w;
+ const int step = height * width;
+ __global const Dtype* in_off = in + offset;
+ __global Dtype* out_off = out + offset;
+ Dtype scale_val;
+ int head = 0;
+ const int pre_pad = (size - 1) / 2;
+ const int post_pad = size - pre_pad - 1;
+ Dtype accum_scale = 0;
+ // fill the scale at [n, :, h, w]
+ // accumulate values
+ while (head < post_pad && head < channels) {
+ accum_scale += in_off[head * step] * in_off[head * step];
+ ++head;
+ }
+ // both add and subtract
+ while (head < channels) {
+ accum_scale += in_off[head * step] * in_off[head * step];
+ if (head - size >= 0) {
+ accum_scale -= in_off[(head - size) * step]
+ * in_off[(head - size) * step];
+ }
+ scale_val = k + accum_scale * alpha_over_size;
+ out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((float)scale_val, (float)negative_beta);
+ ++head;
+ }
+ // subtract only
+ while (head < channels + post_pad) {
+ if (head - size >= 0) {
+ accum_scale -= in_off[(head - size) * step]
+ * in_off[(head - size) * step];
+ }
+ scale_val = k + accum_scale * alpha_over_size;
+ out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((float)scale_val, (float)negative_beta);
+ ++head;
+ }
+ }
+}
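+
+// NOTE (illustrative): per output position the kernel applies the cross-channel LRN formula,
+// roughly
+//   scale  = k + alpha_over_size * sum over the size-channel window of in[c']^2
+//   out[c] = in[c] * pow(scale, -beta)
+// accum_scale is maintained as a running window sum (add the entering channel, subtract the
+// leaving one), so each channel costs O(1) work instead of O(size).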
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#define CONCAT(A,B) A##_##B
+#define TEMPLATE(name,type) CONCAT(name,type)
+#define Dtype float
+
+void TEMPLATE(max_pool_forward_impl, Dtype)(
+ const int nthreads, __global const Dtype* bottom_data, const int num,
+ const int channels, const int height, const int width,
+ const int pooled_height, const int pooled_width, const int kernel_h,
+ const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+ const int pad_w,
+ __global Dtype* top_data,
+ const int use_mask, __global int* mask, __global Dtype* top_mask, bool no_mask)
+{
+ for (int index = get_global_id(0); index < nthreads;
+ index += get_global_size(0))
+ {
+ const int pw = index % pooled_width;
+ const int ph = (index / pooled_width) % pooled_height;
+ const int c = (index / pooled_width / pooled_height) % channels;
+ const int n = index / pooled_width / pooled_height / channels;
+ int hstart = ph * stride_h - pad_h;
+ int wstart = pw * stride_w - pad_w;
+ const int hend = min(hstart + kernel_h, height);
+ const int wend = min(wstart + kernel_w, width);
+ hstart = max(hstart, (int)0);
+ wstart = max(wstart, (int)0);
+ Dtype maxval = -FLT_MAX;
+ int maxidx = -1;
+ __global const Dtype* bottom_slice = bottom_data
+ + (n * channels + c) * height * width;
+ for (int h = hstart; h < hend; ++h) {
+ for (int w = wstart; w < wend; ++w) {
+ if (bottom_slice[h * width + w] > maxval) {
+ maxidx = h * width + w;
+ maxval = bottom_slice[maxidx];
+ }
+ }
+ }
+ top_data[index] = maxval;
+ if (!no_mask) {
+ if (use_mask == 1) {
+ mask[index] = maxidx;
+ } else {
+ top_mask[index] = maxidx;
+ }
+ }
+ }
+}
+
+__kernel void TEMPLATE(max_pool_forward, Dtype)(
+ const int nthreads, __global const Dtype* bottom_data, const int num,
+ const int channels, const int height, const int width,
+ const int pooled_height, const int pooled_width, const int kernel_h,
+ const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+ const int pad_w,
+ __global Dtype* top_data,
+ const int use_mask, __global int* mask, __global Dtype* top_mask)
+{
+ TEMPLATE(max_pool_forward_impl, Dtype)(
+ nthreads, bottom_data, num, channels, height, width,
+ pooled_height, pooled_width, kernel_h,
+ kernel_w, stride_h, stride_w, pad_h, pad_w, top_data, use_mask, mask, top_mask, false
+ );
+}
+
+__kernel void TEMPLATE(ave_pool_forward, Dtype)(
+ const int nthreads, __global const Dtype* const bottom_data, const int num,
+ const int channels, const int height, const int width,
+ const int pooled_height, const int pooled_width, const int kernel_h,
+ const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
+ const int pad_w, __global Dtype* top_data)
+{
+ for (int index = get_global_id(0); index < nthreads;
+ index += get_global_size(0))
+ {
+ {
+ const int pw = index % pooled_width;
+ const int ph = (index / pooled_width) % pooled_height;
+ const int c = (index / pooled_width / pooled_height) % channels;
+ const int n = index / pooled_width / pooled_height / channels;
+ int hstart = ph * stride_h - pad_h;
+ int wstart = pw * stride_w - pad_w;
+ int hend = min(hstart + kernel_h, height + pad_h);
+ int wend = min(wstart + kernel_w, width + pad_w);
+ const int pool_size = (hend - hstart) * (wend - wstart);
+ hstart = max(hstart, (int)0);
+ wstart = max(wstart, (int)0);
+ hend = min(hend, height);
+ wend = min(wend, width);
+ Dtype aveval = 0;
+ __global const Dtype* bottom_slice = bottom_data
+ + (n * channels + c) * height * width;
+ for (int h = hstart; h < hend; ++h) {
+ for (int w = wstart; w < wend; ++w) {
+ aveval += bottom_slice[h * width + w];
+ }
+ }
+ top_data[index] = aveval / pool_size;
+ }
+ }
+}
+
+__kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
+ const int nthreads, __global const Dtype* const bottom_data, const int num,
+ const int channels, const int height, const int width,
+ const int pooled_height, const int pooled_width, const int kernel_h,
+ const int kernel_w, const int stride_h, const int stride_w,
+ __global Dtype* top_data)
+{
+ for (int index = get_global_id(0); index < nthreads;
+ index += get_global_size(0))
+ {
+ const int pw = index % pooled_width;
+ const int ph = (index / pooled_width) % pooled_height;
+ const int c = (index / pooled_width / pooled_height) % channels;
+ const int n = index / pooled_width / pooled_height / channels;
+ const int hstart = ph * stride_h;
+ const int hend = min(hstart + kernel_h, height);
+ const int wstart = pw * stride_w;
+ const int wend = min(wstart + kernel_w, width);
+ // Initialize cumsum to FLT_MIN (not 0) to avoid divide-by-zero on an all-zero window
+ Dtype cumsum = FLT_MIN;
+ Dtype cumvalues = 0.;
+ __global const Dtype* bottom_slice = bottom_data
+ + (n * channels + c) * height * width;
+ // Single pass: accumulate the window's sum and sum of squares
+ for (int h = hstart; h < hend; ++h) {
+ for (int w = wstart; w < wend; ++w) {
+ cumsum += bottom_slice[h * width + w];
+ cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w];
+ }
+ }
+ top_data[index] = cumvalues / cumsum;
+ }
+}
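+
+// NOTE (illustrative): in test mode stochastic pooling is replaced by its expectation, the
+// activation-weighted average over the window:
+//   out = sum(x_i * x_i) / sum(x_i)
+// which is what the single pass above accumulates.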
if(index < count) {
int n = index / channels / spatial_dim;
int s = index % spatial_dim;
- data[index] /= channel_sum[n * spatial_dim + s];
+ T v = data[index] / channel_sum[n * spatial_dim + s];
+#ifdef LOG_SOFTMAX
+ v = log(v);
+#endif
+ data[index] = v;
}
-}
\ No newline at end of file
+}
--- /dev/null
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+// By downloading, copying, installing or using the software you agree to this license.
+// If you do not agree to this license, do not download, install,
+// copy or use the software.
+//
+//
+// License Agreement
+// For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistribution's of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistribution's in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * The name of the copyright holders may not be used to endorse or promote products
+// derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#define CONCAT(A,B) A##_##B
+#define TEMPLATE(name,type) CONCAT(name,type)
+#define Dtype float
+
+#if defined(cl_intel_subgroups)
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#endif
+
+__kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int channels,
+ const int spatial_dim,
+ __global Dtype* scale,
+ __global const Dtype* data,
+ __global Dtype* out,
+ __local Dtype *out_tmp,
+ __local Dtype *scale_tmp,
+ __local Dtype *group_tmp) {
+
+ int n = get_global_id(1);
+ for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
+ get_global_size(0), ++s) {
+ float maxval = -FLT_MAX;
+ for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
+ Dtype tmp = data[(n * channels + c) * spatial_dim + s];
+ maxval = max((Dtype)tmp, (Dtype)maxval);
+ }
+ maxval = sub_group_reduce_max(maxval * 100000);
+ //if (get_sub_group_local_id() == 0)
+ group_tmp[get_sub_group_id() * spatial_dim + s] = maxval;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index +=
+ get_global_size(0)) {
+ int s = index / get_max_sub_group_size();
+ Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
+ //if (get_sub_group_local_id() == 0)
+ scale_tmp[s] = maxval / 100000;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ for (int index = get_global_id(0); index < channels * spatial_dim;
+ index += get_global_size(0)) {
+ int s = index % spatial_dim;
+ out_tmp[index] = exp(data[n * channels * spatial_dim + index] - scale_tmp[s]);
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
+ get_global_size(0), ++s) {
+ Dtype sum = 0;
+ for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
+ sum += out_tmp[c * spatial_dim + s];
+ }
+ sum = sub_group_reduce_add(sum * 100000);
+ group_tmp[get_sub_group_id() * spatial_dim + s] = sum;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index +=
+ get_global_size(0)) {
+ int s = index / get_max_sub_group_size();
+ Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
+ //if (get_sub_group_local_id() == 0)
+ scale_tmp[s] = sum / 100000;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ for (int index = get_global_id(0); index < channels * spatial_dim;
+ index += get_global_size(0)) {
+ int s = index % spatial_dim;
+ out[n * channels * spatial_dim + index] = out_tmp[index] / scale_tmp[s];
+ }
+}
+
+__kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
+ const int spatial_dim,
+ __global Dtype* scale,
+ __global const Dtype* data,
+ __global Dtype* out) {
+
+ int n = get_global_id(1);
+ __global Dtype *group_tmp = scale + spatial_dim * num + n * get_max_sub_group_size() * spatial_dim;
+ for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
+ get_global_size(0), ++s) {
+ float maxval = -FLT_MAX;
+ for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
+ Dtype tmp = data[(n * channels + c) * spatial_dim + s];
+ maxval = max((Dtype)tmp, (Dtype)maxval);
+ }
+ maxval = sub_group_reduce_max(maxval * 100000);
+ //if (get_sub_group_local_id() == 0)
+ group_tmp[get_sub_group_id() * spatial_dim + s] = maxval;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+
+ for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index +=
+ get_global_size(0)) {
+ int s = index / get_max_sub_group_size();
+ Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
+ //if (get_sub_group_local_id() == 0)
+ scale[n * spatial_dim + s] = maxval / 100000;
+ }
+
+ barrier(CLK_GLOBAL_MEM_FENCE);
+
+ for (int index = get_global_id(0); index < channels * spatial_dim;
+ index += get_global_size(0)) {
+ int s = index % spatial_dim;
+ out[n * channels * spatial_dim + index] = exp(data[n * channels * spatial_dim + index] - scale[n * spatial_dim + s]);
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+
+ for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
+ get_global_size(0), ++s) {
+ Dtype sum = 0;
+ for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
+ sum += out[n * channels * spatial_dim + c * spatial_dim + s];
+ }
+ sum = sub_group_reduce_add(sum * 100000);
+ group_tmp[get_sub_group_id() * spatial_dim + s] = sum;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+
+ for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index +=
+ get_global_size(0)) {
+ int s = index / get_max_sub_group_size();
+ Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
+ //if (get_sub_group_local_id() == 0)
+ scale[n * spatial_dim + s] = sum / 100000;
+ }
+ barrier(CLK_GLOBAL_MEM_FENCE);
+
+ for (int index = get_global_id(0); index < channels * spatial_dim;
+ index += get_global_size(0)) {
+ int s = index % spatial_dim;
+ out[n * channels * spatial_dim + index] /= scale[n * spatial_dim + s];
+ }
+}
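+
+// NOTE (illustrative): both kernels implement the numerically stable softmax
+//   out[c] = exp(x[c] - max_c' x[c']) / sum_c' exp(x[c'] - max_c' x[c'])
+// per (n, s) column in four passes: channel max, exponentiation, channel sum, normalization.
+// The matching "* 100000" and "/ 100000" around the sub-group reductions cancel
+// mathematically and do not change the result beyond rounding.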
//M*/
#include <opencv2/core.hpp>
+#include <opencv2/core/ocl.hpp>
+#include <opencv2/core/opencl/ocl_defs.hpp>
#include <opencv2/core/utils/trace.hpp>
#include <opencv2/core/softfloat.hpp> // int32_t (MSVS 2010-2013)
#include "cvconfig.h"
normAssert(out, ref);
}
+OCL_TEST(Reproducibility_GoogLeNet, Accuracy)
+{
+ Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt", false),
+ findDataFile("dnn/bvlc_googlenet.caffemodel", false));
+
+ net.setPreferableBackend(DNN_BACKEND_DEFAULT);
+ net.setPreferableTarget(DNN_TARGET_OPENCL);
+
+ std::vector<Mat> inpMats;
+ inpMats.push_back( imread(_tf("googlenet_0.png")) );
+ inpMats.push_back( imread(_tf("googlenet_1.png")) );
+ ASSERT_TRUE(!inpMats[0].empty() && !inpMats[1].empty());
+
+ net.setInput(blobFromImages(inpMats, 1.0f, Size(), Scalar(), false), "data");
+ Mat out = net.forward("prob");
+
+ Mat ref = blobFromNPY(_tf("googlenet_prob.npy"));
+ normAssert(out, ref);
+}
+
TEST(IntermediateBlobs_GoogLeNet, Accuracy)
{
Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt", false),
}
}
+OCL_TEST(IntermediateBlobs_GoogLeNet, Accuracy)
+{
+ Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt", false),
+ findDataFile("dnn/bvlc_googlenet.caffemodel", false));
+
+ net.setPreferableBackend(DNN_BACKEND_DEFAULT);
+ net.setPreferableTarget(DNN_TARGET_OPENCL);
+
+ std::vector<String> blobsNames;
+ blobsNames.push_back("conv1/7x7_s2");
+ blobsNames.push_back("conv1/relu_7x7");
+ blobsNames.push_back("inception_4c/1x1");
+ blobsNames.push_back("inception_4c/relu_1x1");
+ std::vector<Mat> outs;
+ Mat in = blobFromImage(imread(_tf("googlenet_0.png")), 1.0f, Size(), Scalar(), false);
+ net.setInput(in, "data");
+ net.forward(outs, blobsNames);
+ CV_Assert(outs.size() == blobsNames.size());
+
+ for (size_t i = 0; i < blobsNames.size(); i++)
+ {
+ std::string filename = blobsNames[i];
+ std::replace( filename.begin(), filename.end(), '/', '#');
+ Mat ref = blobFromNPY(_tf("googlenet_" + filename + ".npy"));
+
+ normAssert(outs[i], ref, "", 1E-4, 1E-2);
+ }
+}
+
TEST(SeveralCalls_GoogLeNet, Accuracy)
{
Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt", false),
normAssert(outs[0], ref, "", 1E-4, 1E-2);
}
+OCL_TEST(SeveralCalls_GoogLeNet, Accuracy)
+{
+ Net net = readNetFromCaffe(findDataFile("dnn/bvlc_googlenet.prototxt", false),
+ findDataFile("dnn/bvlc_googlenet.caffemodel", false));
+
+ net.setPreferableBackend(DNN_BACKEND_DEFAULT);
+ net.setPreferableTarget(DNN_TARGET_OPENCL);
+
+ std::vector<Mat> inpMats;
+ inpMats.push_back( imread(_tf("googlenet_0.png")) );
+ inpMats.push_back( imread(_tf("googlenet_1.png")) );
+ ASSERT_TRUE(!inpMats[0].empty() && !inpMats[1].empty());
+
+ net.setInput(blobFromImages(inpMats, 1.0f, Size(), Scalar(), false), "data");
+ Mat out = net.forward();
+
+ Mat ref = blobFromNPY(_tf("googlenet_prob.npy"));
+ normAssert(out, ref);
+
+ std::vector<String> blobsNames;
+ blobsNames.push_back("conv1/7x7_s2");
+ std::vector<Mat> outs;
+ Mat in = blobFromImage(inpMats[0], 1.0f, Size(), Scalar(), false);
+ net.setInput(in, "data");
+ net.forward(outs, blobsNames);
+ CV_Assert(outs.size() == blobsNames.size());
+
+ ref = blobFromNPY(_tf("googlenet_conv1#7x7_s2.npy"));
+
+ normAssert(outs[0], ref, "", 1E-4, 1E-2);
+}
+
}
}
-void testLayerUsingCaffeModels(String basename, bool useCaffeModel = false, bool useCommonInputBlob = true)
+void testLayerUsingCaffeModels(String basename, int targetId = DNN_TARGET_CPU,
+ bool useCaffeModel = false, bool useCommonInputBlob = true)
{
String prototxt = _tf(basename + ".prototxt");
String caffemodel = _tf(basename + ".caffemodel");
Net net = readNetFromCaffe(prototxt, (useCaffeModel) ? caffemodel : String());
ASSERT_FALSE(net.empty());
+ net.setPreferableBackend(DNN_BACKEND_DEFAULT);
+ net.setPreferableTarget(targetId);
+
Mat inp = blobFromNPY(inpfile);
Mat ref = blobFromNPY(outfile);
TEST(Layer_Test_Softmax, Accuracy)
{
- testLayerUsingCaffeModels("layer_softmax");
+ testLayerUsingCaffeModels("layer_softmax");
+}
+
+OCL_TEST(Layer_Test_Softmax, Accuracy)
+{
+ testLayerUsingCaffeModels("layer_softmax", DNN_TARGET_OPENCL);
}
TEST(Layer_Test_LRN_spatial, Accuracy)
{
- testLayerUsingCaffeModels("layer_lrn_spatial");
+ testLayerUsingCaffeModels("layer_lrn_spatial");
+}
+
+OCL_TEST(Layer_Test_LRN_spatial, Accuracy)
+{
+ testLayerUsingCaffeModels("layer_lrn_spatial", DNN_TARGET_OPENCL);
}
TEST(Layer_Test_LRN_channels, Accuracy)
{
- testLayerUsingCaffeModels("layer_lrn_channels");
+ testLayerUsingCaffeModels("layer_lrn_channels");
+}
+
+OCL_TEST(Layer_Test_LRN_channels, Accuracy)
+{
+ testLayerUsingCaffeModels("layer_lrn_channels", DNN_TARGET_OPENCL);
}
TEST(Layer_Test_Convolution, Accuracy)
{
- testLayerUsingCaffeModels("layer_convolution", true);
+ testLayerUsingCaffeModels("layer_convolution", DNN_TARGET_CPU, true);
+}
+
+OCL_TEST(Layer_Test_Convolution, Accuracy)
+{
+ testLayerUsingCaffeModels("layer_convolution", DNN_TARGET_OPENCL, true);
}
TEST(Layer_Test_DeConvolution, Accuracy)
{
- testLayerUsingCaffeModels("layer_deconvolution", true, false);
+ testLayerUsingCaffeModels("layer_deconvolution", DNN_TARGET_CPU, true, false);
}
TEST(Layer_Test_InnerProduct, Accuracy)
{
- testLayerUsingCaffeModels("layer_inner_product", true);
+ testLayerUsingCaffeModels("layer_inner_product", DNN_TARGET_CPU, true);
+}
+
+OCL_TEST(Layer_Test_InnerProduct, Accuracy)
+{
+ testLayerUsingCaffeModels("layer_inner_product", DNN_TARGET_OPENCL, true);
}
TEST(Layer_Test_Pooling_max, Accuracy)
{
- testLayerUsingCaffeModels("layer_pooling_max");
+ testLayerUsingCaffeModels("layer_pooling_max");
+}
+
+OCL_TEST(Layer_Test_Pooling_max, Accuracy)
+{
+ testLayerUsingCaffeModels("layer_pooling_max", DNN_TARGET_OPENCL);
}
TEST(Layer_Test_Pooling_ave, Accuracy)
{
- testLayerUsingCaffeModels("layer_pooling_ave");
+ testLayerUsingCaffeModels("layer_pooling_ave");
+}
+
+OCL_TEST(Layer_Test_Pooling_ave, Accuracy)
+{
+ testLayerUsingCaffeModels("layer_pooling_ave", DNN_TARGET_OPENCL);
}
TEST(Layer_Test_MVN, Accuracy)
{
- testLayerUsingCaffeModels("layer_mvn");
+ testLayerUsingCaffeModels("layer_mvn");
}
void testReshape(const MatShape& inputShape, const MatShape& targetShape,
TEST(Layer_Test_BatchNorm, Accuracy)
{
- testLayerUsingCaffeModels("layer_batch_norm", true);
+ testLayerUsingCaffeModels("layer_batch_norm", DNN_TARGET_CPU, true);
}
TEST(Layer_Test_ReLU, Accuracy)
{
- testLayerUsingCaffeModels("layer_relu");
+ testLayerUsingCaffeModels("layer_relu");
+}
+
+OCL_TEST(Layer_Test_ReLU, Accuracy)
+{
+ testLayerUsingCaffeModels("layer_relu", DNN_TARGET_OPENCL);
}
TEST(Layer_Test_Dropout, Accuracy)
{
- testLayerUsingCaffeModels("layer_dropout");
+ testLayerUsingCaffeModels("layer_dropout");
}
TEST(Layer_Test_Concat, Accuracy)
{
- testLayerUsingCaffeModels("layer_concat");
+ testLayerUsingCaffeModels("layer_concat");
+}
+
+OCL_TEST(Layer_Test_Concat, Accuracy)
+{
+ testLayerUsingCaffeModels("layer_concat", DNN_TARGET_OPENCL);
}
//template<typename XMat>
#include "test_precomp.hpp"
#include "npy_blob.hpp"
#include <opencv2/dnn/shape_utils.hpp>
+#include <opencv2/ts/ocl_test.hpp>
namespace cvtest
{
ASSERT_FALSE(net.empty());
}
-static void runTorchNet(String prefix, String outLayerName = "",
+static void runTorchNet(String prefix, int targetId = DNN_TARGET_CPU, String outLayerName = "",
bool check2ndBlob = false, bool isBinary = false)
{
String suffix = (isBinary) ? ".dat" : ".txt";
Net net = readNetFromTorch(_tf(prefix + "_net" + suffix), isBinary);
ASSERT_FALSE(net.empty());
+ net.setPreferableBackend(DNN_BACKEND_DEFAULT);
+ net.setPreferableTarget(targetId);
+
Mat inp, outRef;
ASSERT_NO_THROW( inp = readTorchBlob(_tf(prefix + "_input" + suffix), isBinary) );
ASSERT_NO_THROW( outRef = readTorchBlob(_tf(prefix + "_output" + suffix), isBinary) );
runTorchNet("net_conv");
}
+OCL_TEST(Torch_Importer, run_convolution)
+{
+ runTorchNet("net_conv", DNN_TARGET_OPENCL);
+}
+
TEST(Torch_Importer, run_pool_max)
{
- runTorchNet("net_pool_max", "", true);
+ runTorchNet("net_pool_max", DNN_TARGET_CPU, "", true);
+}
+
+OCL_TEST(Torch_Importer, run_pool_max)
+{
+ runTorchNet("net_pool_max", DNN_TARGET_OPENCL, "", true);
}
TEST(Torch_Importer, run_pool_ave)
runTorchNet("net_pool_ave");
}
+OCL_TEST(Torch_Importer, run_pool_ave)
+{
+ runTorchNet("net_pool_ave", DNN_TARGET_OPENCL);
+}
+
TEST(Torch_Importer, run_reshape)
{
runTorchNet("net_reshape");
runTorchNet("net_reshape_batch");
runTorchNet("net_reshape_single_sample");
- runTorchNet("net_reshape_channels", "", false, true);
+ runTorchNet("net_reshape_channels", DNN_TARGET_CPU, "", false, true);
}
TEST(Torch_Importer, run_linear)
TEST(Torch_Importer, run_paralel)
{
- runTorchNet("net_parallel", "l5_torchMerge");
+ runTorchNet("net_parallel", DNN_TARGET_CPU, "l5_torchMerge");
}
TEST(Torch_Importer, run_concat)
{
- runTorchNet("net_concat", "l5_torchMerge");
- runTorchNet("net_depth_concat", "", false, true);
+ runTorchNet("net_concat", DNN_TARGET_CPU, "l5_torchMerge");
+ runTorchNet("net_depth_concat", DNN_TARGET_CPU, "", false, true);
+}
+
+OCL_TEST(Torch_Importer, run_concat)
+{
+ runTorchNet("net_concat", DNN_TARGET_OPENCL, "l5_torchMerge");
+ runTorchNet("net_depth_concat", DNN_TARGET_OPENCL, "", false, true);
}
TEST(Torch_Importer, run_deconv)
runTorchNet("net_softmax_spatial");
}
+OCL_TEST(Torch_Importer, net_softmax)
+{
+ runTorchNet("net_softmax", DNN_TARGET_OPENCL);
+ runTorchNet("net_softmax_spatial", DNN_TARGET_OPENCL);
+}
+
TEST(Torch_Importer, net_logsoftmax)
{
runTorchNet("net_logsoftmax");
runTorchNet("net_logsoftmax_spatial");
}
+OCL_TEST(Torch_Importer, net_logsoftmax)
+{
+ runTorchNet("net_logsoftmax", DNN_TARGET_OPENCL);
+ runTorchNet("net_logsoftmax_spatial", DNN_TARGET_OPENCL);
+}
+
TEST(Torch_Importer, net_lp_pooling)
{
- runTorchNet("net_lp_pooling_square", "", false, true);
- runTorchNet("net_lp_pooling_power", "", false, true);
+ runTorchNet("net_lp_pooling_square", DNN_TARGET_CPU, "", false, true);
+ runTorchNet("net_lp_pooling_power", DNN_TARGET_CPU, "", false, true);
}
TEST(Torch_Importer, net_conv_gemm_lrn)
{
- runTorchNet("net_conv_gemm_lrn", "", false, true);
+ runTorchNet("net_conv_gemm_lrn", DNN_TARGET_CPU, "", false, true);
}
TEST(Torch_Importer, net_inception_block)
{
- runTorchNet("net_inception_block", "", false, true);
+ runTorchNet("net_inception_block", DNN_TARGET_CPU, "", false, true);
}
TEST(Torch_Importer, net_normalize)
{
- runTorchNet("net_normalize", "", false, true);
+ runTorchNet("net_normalize", DNN_TARGET_CPU, "", false, true);
}
TEST(Torch_Importer, net_padding)
{
- runTorchNet("net_padding", "", false, true);
- runTorchNet("net_spatial_zero_padding", "", false, true);
+ runTorchNet("net_padding", DNN_TARGET_CPU, "", false, true);
+ runTorchNet("net_spatial_zero_padding", DNN_TARGET_CPU, "", false, true);
}
TEST(Torch_Importer, ENet_accuracy)
normAssert(out, outRef);
}
+OCL_TEST(Torch_Importer, OpenFace_accuracy)
+{
+ const string model = findDataFile("dnn/openface_nn4.small2.v1.t7", false);
+ Net net = readNetFromTorch(model);
+
+ net.setPreferableBackend(DNN_BACKEND_DEFAULT);
+ net.setPreferableTarget(DNN_TARGET_OPENCL);
+
+ Mat sample = imread(findDataFile("cv/shared/lena.png", false));
+ Mat sampleF32(sample.size(), CV_32FC3);
+ sample.convertTo(sampleF32, sampleF32.type());
+ sampleF32 /= 255;
+ resize(sampleF32, sampleF32, Size(96, 96), 0, 0, INTER_NEAREST);
+
+ Mat inputBlob = blobFromImage(sampleF32);
+
+ net.setInput(inputBlob);
+ Mat out = net.forward();
+
+ Mat outRef = readTorchBlob(_tf("net_openface_output.dat"), true);
+ normAssert(out, outRef);
+}
+
+OCL_TEST(Torch_Importer, ENet_accuracy)
+{
+ Net net;
+ {
+ const string model = findDataFile("dnn/Enet-model-best.net", false);
+ Ptr<Importer> importer = createTorchImporter(model, true);
+ ASSERT_TRUE(importer != NULL);
+ importer->populateNet(net);
+ }
+
+ net.setPreferableBackend(DNN_BACKEND_DEFAULT);
+ net.setPreferableTarget(DNN_TARGET_OPENCL);
+
+ Mat sample = imread(_tf("street.png", false));
+ Mat inputBlob = blobFromImage(sample, 1./255);
+
+ net.setInput(inputBlob, "");
+ Mat out = net.forward();
+ Mat ref = blobFromNPY(_tf("torch_enet_prob.npy", false));
+ // Due to numerical instability in the Pooling-Unpooling layers (index jittering),
+ // the thresholds for ENet must be relaxed. Accuracy of the results was checked on the
+ // Cityscapes dataset; the difference in mIOU relative to Torch is 10E-4%.
+ normAssert(ref, out, "", 0.00044, 0.44);
+
+ const int N = 3;
+ for (int i = 0; i < N; i++)
+ {
+ net.setInput(inputBlob, "");
+ Mat out = net.forward();
+ normAssert(ref, out, "", 0.00044, 0.44);
+ }
+}
+
}
#endif