From: Li Peng <peng.li@intel.com>
Date: Tue, 5 Dec 2017 15:17:34 +0000 (+0800)
Subject: prior box layer ocl implementation
X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~269^2~2
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=910d7dab1f056ccd6531c8615e4f50e0c1c1c76d;p=platform%2Fupstream%2Fopencv.git

prior box layer ocl implementation

Signed-off-by: Li Peng <peng.li@intel.com>
---

diff --git a/modules/dnn/src/layers/prior_box_layer.cpp b/modules/dnn/src/layers/prior_box_layer.cpp
index 5fc852a..575ac5e 100644
--- a/modules/dnn/src/layers/prior_box_layer.cpp
+++ b/modules/dnn/src/layers/prior_box_layer.cpp
@@ -45,6 +45,7 @@
 #include <float.h>
 #include <algorithm>
 #include <cmath>
+#include "opencl_kernels_dnn.hpp"
 
 namespace cv
 {
@@ -270,11 +271,108 @@ public:
         return false;
     }
 
+#ifdef HAVE_OPENCL
+    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
+    {
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+
+        inps.getUMatVector(inputs);
+        outs.getUMatVector(outputs);
+
+        int _layerWidth = inputs[0].size[3];
+        int _layerHeight = inputs[0].size[2];
+
+        int _imageWidth = inputs[1].size[3];
+        int _imageHeight = inputs[1].size[2];
+
+        float stepX, stepY;
+        if (_stepX == 0 || _stepY == 0)
+        {
+            stepX = static_cast<float>(_imageWidth) / _layerWidth;
+            stepY = static_cast<float>(_imageHeight) / _layerHeight;
+        } else {
+            stepX = _stepX;
+            stepY = _stepY;
+        }
+
+        if (umat_offsetsX.empty())
+        {
+            Mat offsetsX(1, _offsetsX.size(), CV_32FC1, &_offsetsX[0]);
+            Mat offsetsY(1, _offsetsX.size(), CV_32FC1, &_offsetsY[0]);
+            Mat aspectRatios(1, _aspectRatios.size(), CV_32FC1, &_aspectRatios[0]);
+            Mat variance(1, _variance.size(), CV_32FC1, &_variance[0]);
+
+            offsetsX.copyTo(umat_offsetsX);
+            offsetsY.copyTo(umat_offsetsY);
+            aspectRatios.copyTo(umat_aspectRatios);
+            variance.copyTo(umat_variance);
+
+            int real_numPriors = _numPriors / pow(2, _offsetsX.size() - 1);
+            umat_scales = UMat(1, &real_numPriors, CV_32F, 1.0f);
+        }
+
+        size_t nthreads = _layerHeight * _layerWidth;
+
+        ocl::Kernel kernel("prior_box", ocl::dnn::prior_box_oclsrc);
+        kernel.set(0, (int)nthreads);
+        kernel.set(1, (float)stepX);
+        kernel.set(2, (float)stepY);
+        kernel.set(3, (float)_minSize);
+        kernel.set(4, (float)_maxSize);
+        kernel.set(5, ocl::KernelArg::PtrReadOnly(umat_offsetsX));
+        kernel.set(6, ocl::KernelArg::PtrReadOnly(umat_offsetsY));
+        kernel.set(7, (int)_offsetsX.size());
+        kernel.set(8, ocl::KernelArg::PtrReadOnly(umat_aspectRatios));
+        kernel.set(9, (int)_aspectRatios.size());
+        kernel.set(10, ocl::KernelArg::PtrReadOnly(umat_scales));
+        kernel.set(11, ocl::KernelArg::PtrWriteOnly(outputs[0]));
+        kernel.set(12, (int)_layerHeight);
+        kernel.set(13, (int)_layerWidth);
+        kernel.set(14, (int)_imageHeight);
+        kernel.set(15, (int)_imageWidth);
+        kernel.run(1, &nthreads, NULL, false);
+
+        // clip the prior's coordidate such that it is within [0, 1]
+        if (_clip)
+        {
+            Mat mat = outputs[0].getMat(ACCESS_READ);
+            int aspect_count = (_maxSize > 0) ? 1 : 0;
+            int offset = nthreads * 4 * _offsetsX.size() * (1 + aspect_count + _aspectRatios.size());
+            float* outputPtr = mat.ptr<float>() + offset;
+            int _outChannelSize = _layerHeight * _layerWidth * _numPriors * 4;
+            for (size_t d = 0; d < _outChannelSize; ++d)
+            {
+                outputPtr[d] = std::min<float>(std::max<float>(outputPtr[d], 0.), 1.);
+            }
+        }
+
+        // set the variance.
+        {
+            ocl::Kernel kernel("set_variance", ocl::dnn::prior_box_oclsrc);
+            int offset = total(shape(outputs[0]), 2);
+            size_t nthreads = _layerHeight * _layerWidth * _numPriors;
+            kernel.set(0, (int)nthreads);
+            kernel.set(1, (int)offset);
+            kernel.set(2, (int)_variance.size());
+            kernel.set(3, ocl::KernelArg::PtrReadOnly(umat_variance));
+            kernel.set(4, ocl::KernelArg::PtrWriteOnly(outputs[0]));
+            if (!kernel.run(1, &nthreads, NULL, false))
+                return false;
+        }
+        return true;
+    }
+#endif
+
     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
     {
         CV_TRACE_FUNCTION();
         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 
+        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
+                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
+                   forward_ocl(inputs_arr, outputs_arr, internals_arr))
+
         Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
     }
 
@@ -441,6 +539,14 @@ private:
     std::vector<float> _offsetsX;
     std::vector<float> _offsetsY;
 
+#ifdef HAVE_OPENCL
+    UMat umat_offsetsX;
+    UMat umat_offsetsY;
+    UMat umat_aspectRatios;
+    UMat umat_scales;
+    UMat umat_variance;
+#endif
+
     bool _flip;
     bool _clip;
     bool _explicitSizes;
diff --git a/modules/dnn/src/opencl/prior_box.cl b/modules/dnn/src/opencl/prior_box.cl
new file mode 100644
index 0000000..660ccb6
--- /dev/null
+++ b/modules/dnn/src/opencl/prior_box.cl
@@ -0,0 +1,148 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#define Dtype float
+#define Dtype4 float4
+
+__kernel void prior_box(const int nthreads,
+                        const Dtype stepX,
+                        const Dtype stepY,
+                        const Dtype _minSize,
+                        const Dtype _maxSize,
+                        __global const Dtype* _offsetsX,
+                        __global const Dtype* _offsetsY,
+                        const int offsetsX_size,
+                        __global const Dtype* _aspectRatios,
+                        const int aspectRatios_size,
+                        __global const Dtype* scales,
+                        __global Dtype* dst,
+                        const int _layerHeight,
+                        const int _layerWidth,
+                        const int imgHeight,
+                        const int imgWidth)
+{
+    for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
+    {
+        int w = index % _layerWidth;
+        int h = index / _layerWidth;
+        __global Dtype* outputPtr;
+        int aspect_count = (_maxSize > 0) ? 1 : 0;
+        outputPtr = dst + index * 4 * offsetsX_size * (1 + aspect_count + aspectRatios_size);
+
+        Dtype _boxWidth, _boxHeight;
+        Dtype4 vec;
+        _boxWidth = _boxHeight = _minSize * scales[0];
+        for (int i = 0; i < offsetsX_size; ++i)
+        {
+            float center_x = (w + _offsetsX[i]) * stepX;
+            float center_y = (h + _offsetsY[i]) * stepY;
+
+            vec.x = (center_x - _boxWidth * 0.5f) / imgWidth;    // xmin
+            vec.y = (center_y - _boxHeight * 0.5f) / imgHeight;  // ymin
+            vec.z = (center_x + _boxWidth * 0.5f) / imgWidth;    // xmax
+            vec.w = (center_y + _boxHeight * 0.5f) / imgHeight;  // ymax
+            vstore4(vec, 0, outputPtr);
+
+            outputPtr += 4;
+        }
+
+        if (_maxSize > 0)
+        {
+            _boxWidth = _boxHeight = native_sqrt(_minSize * _maxSize) * scales[1];
+
+            for (int i = 0; i < offsetsX_size; ++i)
+            {
+                float center_x = (w + _offsetsX[i]) * stepX;
+                float center_y = (h + _offsetsY[i]) * stepY;
+
+                vec.x = (center_x - _boxWidth * 0.5f) / imgWidth;    // xmin
+                vec.y = (center_y - _boxHeight * 0.5f) / imgHeight;  // ymin
+                vec.z = (center_x + _boxWidth * 0.5f) / imgWidth;    // xmax
+                vec.w = (center_y + _boxHeight * 0.5f) / imgHeight;  // ymax
+                vstore4(vec, 0, outputPtr);
+
+                outputPtr += 4;
+            }
+        }
+
+        for (int r = 0; r < aspectRatios_size; ++r)
+        {
+            float ar = native_sqrt(_aspectRatios[r]);
+            float scale = scales[(_maxSize > 0 ? 2 : 1) + r];
+
+            _boxWidth = _minSize * ar * scale;
+            _boxHeight = _minSize / ar * scale;
+
+            for (int i = 0; i < offsetsX_size; ++i)
+            {
+                float center_x = (w + _offsetsX[i]) * stepX;
+                float center_y = (h + _offsetsY[i]) * stepY;
+
+                vec.x = (center_x - _boxWidth * 0.5f) / imgWidth;    // xmin
+                vec.y = (center_y - _boxHeight * 0.5f) / imgHeight;  // ymin
+                vec.z = (center_x + _boxWidth * 0.5f) / imgWidth;    // xmax
+                vec.w = (center_y + _boxHeight * 0.5f) / imgHeight;  // ymax
+                vstore4(vec, 0, outputPtr);
+
+                outputPtr += 4;
+            }
+        }
+    }
+}
+
+__kernel void set_variance(const int nthreads,
+                           const int offset,
+                           const int variance_size,
+                           __global const Dtype* variance,
+                           __global Dtype* dst)
+{
+    for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
+    {
+        Dtype4 var_vec;
+
+        if (variance_size == 1)
+            var_vec = (Dtype4)(variance[0]);
+        else
+            var_vec = vload4(0, variance);
+
+        vstore4(var_vec, 0, dst + offset + index * 4);
+    }
+}