From 54c81cbde43d879fbfc9c272a311e3947a327db3 Mon Sep 17 00:00:00 2001 From: Li Peng Date: Thu, 25 Jan 2018 22:15:41 +0800 Subject: [PATCH] eltwise layer SUM op update Signed-off-by: Li Peng --- modules/dnn/src/layers/eltwise_layer.cpp | 54 +++++++++++++----- modules/dnn/src/opencl/eltwise.cl | 98 ++++++++++++++++++++++++++++++++ 2 files changed, 138 insertions(+), 14 deletions(-) create mode 100644 modules/dnn/src/opencl/eltwise.cl diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp index 4037573..27bd7ee 100644 --- a/modules/dnn/src/layers/eltwise_layer.cpp +++ b/modules/dnn/src/layers/eltwise_layer.cpp @@ -43,6 +43,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" #include "op_halide.hpp" +#include "opencl_kernels_dnn.hpp" namespace cv { @@ -271,22 +272,47 @@ public: switch (op) { case SUM: - if (coeffs.empty()) - { - add(inputs[0], inputs[1], outputs[0]); - for (int i = 2; i < inputs.size(); ++i) - add(outputs[0], inputs[i], outputs[0]); - } - else { - UMat mul0, mul1; - multiply(coeffs[0], inputs[0], mul0); - multiply(coeffs[1], inputs[1], mul1); - add(mul0, mul1, outputs[0]); - for (int i = 2; i < inputs.size(); ++i) + int channels = total(shape(outputs[0]), 0, 2); + int plane_size = total(shape(outputs[0]), 2); + if (channels % 4 == 0 && plane_size % 4 == 0) + { + size_t localsize[] = { 128 }; + size_t globalsize[] = { (size_t)channels / 4 * localsize[0] }; + + for (int i = 0; i < (inputs.size() - 1); ++i) + { + String buildopt = format("-DLOOP=%d", i); + ocl::Kernel kernel("op_sum4", ocl::dnn::eltwise_oclsrc, buildopt); + int idx = 0; + UMat inpMat = (i == 0) ? inputs[0] : UMat(); + float coeff1 = (coeffs.empty() || i > 0) ? 1.0f : coeffs[i]; + float coeff2 = coeffs.empty() ? 1.0f : coeffs[i + 1]; + kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inputs[0])); + kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inputs[1])); + kernel.set(idx++, (int)plane_size); + kernel.set(idx++, (float)coeff1); + kernel.set(idx++, (float)coeff2); + kernel.set(idx++, ocl::KernelArg::PtrReadWrite(outputs[0])); + bool ret = kernel.run(1, globalsize, localsize, false); + if (!ret) + return false; + } + } + else { - multiply(coeffs[i], inputs[i], mul0); - add(mul0, outputs[0], outputs[0]); + float coeff1 = coeffs.empty() ? 1.f : coeffs[0]; + float coeff2 = coeffs.empty() ? 1.f : coeffs[1]; + UMat mul0, mul1; + multiply(coeff1, inputs[0], mul0); + multiply(coeff2, inputs[1], mul1); + add(mul0, mul1, outputs[0]); + for (int i = 2; i < inputs.size(); ++i) + { + float coeff = coeffs.empty() ? 1.f : coeffs[i]; + multiply(coeff, inputs[i], mul0); + add(mul0, outputs[0], outputs[0]); + } } } break; diff --git a/modules/dnn/src/opencl/eltwise.cl b/modules/dnn/src/opencl/eltwise.cl new file mode 100644 index 0000000..6f3a374 --- /dev/null +++ b/modules/dnn/src/opencl/eltwise.cl @@ -0,0 +1,98 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2017, Intel Corporation, all rights reserved. +// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#define Dtype float +#define Dtype4 float4 +#define Dtype8 float8 + +__kernel void op_sum4(__global const Dtype * A, + __global const Dtype * B, + unsigned int A_col_size, + const float coeff1, + const float coeff2, + __global Dtype * C) +{ + unsigned int row_gid = get_group_id(0); + unsigned int lid = get_local_id(0); + const __global Dtype *src0_read = A + row_gid * 4 * A_col_size; + const __global Dtype *src1_read = B + row_gid * 4 * A_col_size; + __global Dtype *dst0_read = C + row_gid * 4 * A_col_size; + + Dtype4 a0, a1, a2, a3; + Dtype4 dot0, dot1, dot2, dot3; + unsigned int i = lid; + while( i < A_col_size / 4) + { + const Dtype4 b0 = vload4(i, src1_read); + const Dtype4 b1 = vload4(i, src1_read + A_col_size); + const Dtype4 b2 = vload4(i, src1_read + 2 * A_col_size); + const Dtype4 b3 = vload4(i, src1_read + 3 * A_col_size); + +#if LOOP == 0 + a0 = vload4(i, src0_read); + a1 = vload4(i, src0_read + A_col_size); + a2 = vload4(i, src0_read + 2 * A_col_size); + a3 = vload4(i, src0_read + 3 * A_col_size); + + dot0 = a0 * coeff1 + b0 * coeff2; + dot1 = a1 * coeff1 + b1 * coeff2; + dot2 = a2 * coeff1 + b2 * coeff2; + dot3 = a3 * coeff1 + b3 * coeff2; +#else + a0 = vload4(i, dst0_read); + a1 = vload4(i, dst0_read + A_col_size); + a2 = vload4(i, dst0_read + 2 * A_col_size); + a3 = vload4(i, dst0_read + 3 * A_col_size); + + dot0 = a0 + b0 * coeff2; + dot1 = a1 + b1 * coeff2; + dot2 = a2 + b2 * coeff2; + dot3 = a3 + b3 * coeff2; +#endif + vstore4(dot0, i, dst0_read); + vstore4(dot1, i, dst0_read + A_col_size); + vstore4(dot2, i, dst0_read + 2 * A_col_size); + vstore4(dot3, i, dst0_read + 3 * A_col_size); + + i += get_local_size(0); + } +} -- 2.7.4