From 2863f950d634ed456506362669355aa202df8508 Mon Sep 17 00:00:00 2001
From: Li Peng
Date: Fri, 16 Feb 2018 15:37:59 +0800
Subject: [PATCH] ReLU6 layer ocl support

include relu6 ocl kernel and layer fusion support

Signed-off-by: Li Peng
---
 modules/dnn/include/opencv2/dnn/all_layers.hpp     |  2 ++
 modules/dnn/src/dnn.cpp                            |  1 +
 modules/dnn/src/layers/convolution_layer.cpp       | 14 +++++++++++
 modules/dnn/src/layers/elementwise_layers.cpp      | 29 ++++++++++++++++++++--
 modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp        |  6 ++++-
 .../dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp   | 22 ++++++++++++++++
 modules/dnn/src/opencl/activations.cl              | 11 ++++++++
 modules/dnn/src/opencl/conv_layer_spatial.cl       | 21 +++++++++-------
 8 files changed, 94 insertions(+), 12 deletions(-)

diff --git a/modules/dnn/include/opencv2/dnn/all_layers.hpp b/modules/dnn/include/opencv2/dnn/all_layers.hpp
index 4219108..34f5616 100644
--- a/modules/dnn/include/opencv2/dnn/all_layers.hpp
+++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp
@@ -406,6 +406,8 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
     class CV_EXPORTS ReLU6Layer : public ActivationLayer
     {
     public:
+        float minValue, maxValue;
+
         static Ptr<ReLU6Layer> create(const LayerParams &params);
     };
 
diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp
index 31ae173..4b2b0a0 100644
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
@@ -1439,6 +1439,7 @@ struct Net::Impl
                           nextData &&
                           ((nextData->type == "ReLU") ||
                            (nextData->type == "ChannelsPReLU") ||
+                           (nextData->type == "ReLU6") ||
                            (nextData->type == "TanH") ||
                            (nextData->type == "Power"))) )
                 {
diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp
index 51a28d9..2517499 100644
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -860,6 +860,15 @@ public:
                 activType = OCL4DNN_CONV_FUSED_ACTIV_RELU;
             }
 
+            Ptr<ReLU6Layer> activ_relu6 = activ.dynamicCast<ReLU6Layer>();
+            if( !activ_relu6.empty() )
+            {
+                reluslope.resize(2);
+                reluslope[0] = activ_relu6->minValue;
+                reluslope[1] = activ_relu6->maxValue;
+                activType = OCL4DNN_CONV_FUSED_ACTIV_RELU6;
+            }
+
             Ptr<ChannelsPReLULayer> activ_chprelu = activ.dynamicCast<ChannelsPReLULayer>();
             if( !activ_chprelu.empty() )
             {
@@ -906,12 +915,17 @@ public:
                 {
                     convolutionOp->setActivTanh(true);
                 }
+                else if ( activType == OCL4DNN_CONV_FUSED_ACTIV_RELU6)
+                {
+                    convolutionOp->setActivReLU6(true, reluslope[0], reluslope[1]);
+                }
                 else
                 {
                     convolutionOp->setActivReLU(false, 0);
                     convolutionOp->setActivPReLU(false, reluslope);
                     convolutionOp->setActivPower(false, 1.f);
                     convolutionOp->setActivTanh(false);
+                    convolutionOp->setActivReLU6(false, 0, 0);
                 }
                 newActiv = false;
             }
diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp
index dbbe6ca..adf51c8 100644
--- a/modules/dnn/src/layers/elementwise_layers.cpp
+++ b/modules/dnn/src/layers/elementwise_layers.cpp
@@ -381,8 +381,30 @@ struct ReLU6Functor
 #ifdef HAVE_OPENCL
     bool applyOCL(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
     {
-        // TODO: implement OCL version
-        return false;
+        std::vector<UMat> inputs;
+        std::vector<UMat> outputs;
+
+        inps.getUMatVector(inputs);
+        outs.getUMatVector(outputs);
+        String buildopt = oclGetTMacro(inputs[0]);
+
+        for (size_t i = 0; i < inputs.size(); i++)
+        {
+            UMat& src = inputs[i];
+            UMat& dst = outputs[i];
+
+            ocl::Kernel kernel("ReLU6Forward", ocl::dnn::activations_oclsrc, buildopt);
+            kernel.set(0, (int)src.total());
+            kernel.set(1, ocl::KernelArg::PtrReadOnly(src));
+            kernel.set(2, ocl::KernelArg::PtrWriteOnly(dst));
+            kernel.set(3, (float)minValue);
+            kernel.set(4, (float)maxValue);
+
+            size_t gSize = src.total();
+            CV_Assert(kernel.run(1, &gSize, NULL, false));
+        }
+
+        return true;
     }
 #endif
 
@@ -867,6 +889,9 @@ Ptr<ReLU6Layer> ReLU6Layer::create(const LayerParams& params)
     float maxValue = params.get<float>("max_value", 6.0f);
     Ptr<ReLU6Layer> l(new ElementWiseLayer<ReLU6Functor>(ReLU6Functor(minValue, maxValue)));
     l->setParamsFrom(params);
+    l->minValue = minValue;
+    l->maxValue = maxValue;
+
     return l;
 }
 
diff --git a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
index b536ce4..93ac1a2 100644
--- a/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
+++ b/modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
@@ -78,7 +78,8 @@ typedef enum {
     OCL4DNN_CONV_FUSED_ACTIV_RELU = 1,
     OCL4DNN_CONV_FUSED_ACTIV_PRELU = 2,
     OCL4DNN_CONV_FUSED_ACTIV_POWER = 3,
-    OCL4DNN_CONV_FUSED_ACTIV_TANH = 4
+    OCL4DNN_CONV_FUSED_ACTIV_TANH = 4,
+    OCL4DNN_CONV_FUSED_ACTIV_RELU6 = 5
 } ocl4dnnFusedActiv_t;
 
 template<typename Dtype>
@@ -96,6 +97,7 @@ class OCL4DNNConvSpatial
     void setActivPReLU(bool fuse_activ, std::vector<float> &slope);
     void setActivPower(bool fuse_activ, float power);
     void setActivTanh(bool fuse_activ);
+    void setActivReLU6(bool fuse_activ, float min, float max);
     void setBias(bool bias_term);
 
 private:
@@ -319,6 +321,8 @@ class OCL4DNNConvSpatial
     cv::ocl::ProgramSource src_;
     int32_t prev_kernel_type_;
     float negative_slope_;
+    float min_value_;
+    float max_value_;
     UMat negative_slope_umat_;
     ocl4dnnFusedActiv_t fused_activ_;
     float power_;
diff --git a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
index 16bea92..84ea191 100644
--- a/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
+++ b/modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
@@ -82,6 +82,8 @@ OCL4DNNConvSpatial<Dtype>::OCL4DNNConvSpatial(OCL4DNNConvConfig config)
     fused_eltwise_ = false;
     power_ = 1.f;
     negative_slope_ = 0;
+    min_value_ = 0;
+    max_value_ = 0;
     prev_kernel_type_ = -1;
     tuned_ = false;
 
@@ -162,6 +164,9 @@ void OCL4DNNConvSpatial<Dtype>::setFusionDefine(ocl4dnnFusedActiv_t fused_activ,
         case OCL4DNN_CONV_FUSED_ACTIV_TANH:
             addDef("FUSED_CONV_TANH", 1);
             break;
+        case OCL4DNN_CONV_FUSED_ACTIV_RELU6:
+            addDef("FUSED_CONV_RELU6", 1);
+            break;
         default:
             ;
     }
@@ -184,6 +189,10 @@ void OCL4DNNConvSpatial<Dtype>::setFusionArg(ocl4dnnFusedActiv_t fused_activ, bo
         case OCL4DNN_CONV_FUSED_ACTIV_POWER:
             kernel.set(argIdx++, (float)power_);
             break;
+        case OCL4DNN_CONV_FUSED_ACTIV_RELU6:
+            kernel.set(argIdx++, (float)min_value_);
+            kernel.set(argIdx++, (float)max_value_);
+            break;
         default:
             ;
     }
@@ -394,6 +403,19 @@ void OCL4DNNConvSpatial<Dtype>::setActivReLU(bool fuse_activ, float slope)
 }
 
 template<typename Dtype>
+void OCL4DNNConvSpatial<Dtype>::setActivReLU6(bool fuse_activ, float min, float max)
+{
+    if ( fuse_activ )
+    {
+        fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_RELU6;
+        min_value_ = min;
+        max_value_ = max;
+    }
+    else
+        fused_activ_ = OCL4DNN_CONV_FUSED_ACTIV_NONE;
+}
+
+template<typename Dtype>
 void OCL4DNNConvSpatial<Dtype>::setActivPReLU(bool fuse_activ, std::vector<float> &slope)
 {
     if ( fuse_activ )
diff --git a/modules/dnn/src/opencl/activations.cl b/modules/dnn/src/opencl/activations.cl
index df58a51..ab2532e 100644
--- a/modules/dnn/src/opencl/activations.cl
+++ b/modules/dnn/src/opencl/activations.cl
@@ -54,6 +54,17 @@ __kernel void ReLUForward(const int count, __global const T* in, __global T* out
 #endif
 }
+__kernel void ReLU6Forward(const int count, __global const T* in, __global T* out,
+                           const T minValue, const T maxValue)
+{
+    int index = get_global_id(0);
+    if(index < count)
+    {
+        T x = in[index];
+        out[index] = clamp(x, minValue, maxValue);
+    }
+}
+
 __kernel void PReLUForward(const int count, const int channels, const int plane_size,
                            __global const T* in, __global T* out,
                            __global const T* slope_data)
 {
diff --git a/modules/dnn/src/opencl/conv_layer_spatial.cl b/modules/dnn/src/opencl/conv_layer_spatial.cl
index e31d173..8f6e5a3 100644
--- a/modules/dnn/src/opencl/conv_layer_spatial.cl
+++ b/modules/dnn/src/opencl/conv_layer_spatial.cl
@@ -48,19 +48,22 @@
 
 #if defined(FUSED_CONV_RELU)
 #define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (Dtype)(negative_slope)))
-#define NEGATIVE_SLOPE_ARG Dtype negative_slope,
+#define FUSED_ARG Dtype negative_slope,
 #elif defined(FUSED_CONV_PRELU)
 #define ACTIVATION_RELU_FUNCTION(x, c) ((Dtype)(x) > 0 ? (Dtype)(x) : ((Dtype)(x) * (Dtype)(negative_slope[c])))
-#define NEGATIVE_SLOPE_ARG __global const Dtype *negative_slope,
+#define FUSED_ARG __global const Dtype *negative_slope,
 #elif defined(FUSED_CONV_POWER)
 #define ACTIVATION_RELU_FUNCTION(x, c) pow(x, power)
-#define NEGATIVE_SLOPE_ARG Dtype power,
+#define FUSED_ARG Dtype power,
 #elif defined(FUSED_CONV_TANH)
 #define ACTIVATION_RELU_FUNCTION(x, c) tanh(x)
-#define NEGATIVE_SLOPE_ARG
+#define FUSED_ARG
+#elif defined(FUSED_CONV_RELU6)
+#define ACTIVATION_RELU_FUNCTION(x, c) (clamp((Dtype)(x), min_value, max_value))
+#define FUSED_ARG Dtype min_value, Dtype max_value,
 #else
 #define ACTIVATION_RELU_FUNCTION(x, c) (x)
-#define NEGATIVE_SLOPE_ARG
+#define FUSED_ARG
 #endif
 
 #ifdef FUSED_CONV_ELTWISE
@@ -108,7 +111,7 @@
 
 __kernel void ConvolveBasic(
     ELTWISE_DATA_ARG
-    NEGATIVE_SLOPE_ARG
+    FUSED_ARG
     __global Dtype* image_data,
     int image_offset,
     __global Dtype* kernel_data,
@@ -197,7 +200,7 @@ __attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
 __kernel void
 convolve_simd(
     ELTWISE_DATA_ARG
-    NEGATIVE_SLOPE_ARG
+    FUSED_ARG
     __global Dtype* inputs_base,
     filter_qualifier Dtype* weights_base,
     BIAS_KERNEL_ARG
@@ -417,7 +420,7 @@ typedef struct float0 { float s0; } float0; //never used but makes compiler happ
 
 #define GEMM_LIKE_KERNEL_ARGS     \
     ELTWISE_DATA_ARG              \
-    NEGATIVE_SLOPE_ARG            \
+    FUSED_ARG                     \
     const __global Dtype *src0,   \
     const __global Dtype *src1,   \
     BIAS_KERNEL_ARG               \
@@ -1731,7 +1734,7 @@ __kernel void Conv_Interleaved(GEMM_LIKE_KERNEL_ARGS)
 
 __kernel void DWCONV(
     ELTWISE_DATA_ARG
-    NEGATIVE_SLOPE_ARG
+    FUSED_ARG
     __global Dtype* image_data,
     __global Dtype* kernel_data,
     BIAS_KERNEL_ARG
-- 
2.7.4
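
Illustrative usage (not part of the patch): a minimal sketch of how the new ReLU6
OpenCL path could be exercised through the public dnn API. It assumes an
OpenCL-capable device (otherwise OpenCV falls back to the CPU path); the layer type
"ReLU6" and the "min_value"/"max_value" parameters come from the patch itself, while
the tiny single-layer network below is only an example.

    #include <opencv2/core.hpp>
    #include <opencv2/dnn.hpp>

    int main()
    {
        // Build a one-layer net containing only a ReLU6 activation.
        cv::dnn::LayerParams lp;
        lp.name = "relu6";
        lp.type = "ReLU6";
        lp.set("min_value", 0.0f);   // same defaults ReLU6Layer::create() uses
        lp.set("max_value", 6.0f);

        cv::dnn::Net net;
        net.addLayerToPrev(lp.name, lp.type, lp);
        net.setPreferableTarget(cv::dnn::DNN_TARGET_OPENCL);  // route forward() to the OCL path

        // Random 4D input blob; the ReLU6Forward kernel clamps every element to [0, 6].
        int sz[] = {1, 2, 3, 4};
        cv::Mat input(4, sz, CV_32F);
        cv::randu(input, -10, 10);

        net.setInput(input);
        cv::Mat out = net.forward();
        return 0;
    }

When a Convolution layer immediately precedes the ReLU6 layer, the fusion path added
in dnn.cpp and convolution_layer.cpp applies the same clamp inside the fused
convolution kernel instead of dispatching a separate activation kernel.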