modules/dnn/src/layers/batch_norm_layer.cpp

   1 // This file is part of OpenCV project.
   2 // It is subject to the license terms in the LICENSE file found in the top-level directory
   3 // of this distribution and at http://opencv.org/license.html.
   4
   5 // Copyright (C) 2016, Intel Corporation, all rights reserved.
   6 // Third party copyrights are property of their respective owners.
   7
   8 /*
   9 Implementation of Batch Normalization layer.
  10 */
  11
  12 #include "../precomp.hpp"
  13 #include "layers_common.hpp"
  14 #include "../op_halide.hpp"
  15 #include "../op_inf_engine.hpp"
  16 #include <opencv2/dnn/shape_utils.hpp>
  17
  18 #ifdef HAVE_OPENCL
  19 #include "opencl_kernels_dnn.hpp"
  20 #endif
  21
  22 namespace cv
  23 {
  24 namespace dnn
  25 {
  26
  27 class BatchNormLayerImpl CV_FINAL : public BatchNormLayer
  28 {
  29 public:
  30     Mat weights_, bias_;
  31     UMat umat_weight, umat_bias;
  32     mutable int dims;
  33
  34
  35     BatchNormLayerImpl(const LayerParams& params)
  36         : dims(-1)
  37     {
  38         setParamsFrom(params);
  39         CV_Assert(blobs.size() >= 2);
  40
  41         hasWeights = params.get<bool>("has_weight", false);
  42         hasBias = params.get<bool>("has_bias", false);
  43         useGlobalStats = params.get<bool>("use_global_stats", true);
  44         if(params.get<bool>("scale_bias", false))
  45             hasWeights = hasBias = true;
  46         epsilon = params.get<float>("eps", 1E-5);
  47
  48         size_t n = blobs[0].total();
  49         CV_Assert(blobs[1].total() == n &&
  50                   blobs[0].isContinuous() && blobs[1].isContinuous() &&
  51                   blobs[0].type() == CV_32F && blobs[1].type() == CV_32F);
  52
  53         float varMeanScale = 1.f;
  54         if (!hasWeights && !hasBias && blobs.size() > 2 && useGlobalStats) {
  55             CV_Assert(blobs.size() == 3); CV_CheckTypeEQ(blobs[2].type(), CV_32FC1, "");
  56             varMeanScale = blobs[2].at<float>(0);
  57             if (varMeanScale != 0)
  58                 varMeanScale = 1/varMeanScale;
  59         }
  60
  61         const int biasBlobIndex = blobs.size() - 1;
  62         const int weightsBlobIndex = biasBlobIndex - hasBias;
  63
  64         if( hasWeights )
  65         {
  66             CV_Assert((size_t)weightsBlobIndex < blobs.size());
  67             const Mat& w = blobs[weightsBlobIndex];
  68             CV_Assert(w.isContinuous() && w.type() == CV_32F && w.total() == (size_t)n);
  69         }
  70
  71         if( hasBias )
  72         {
  73             CV_Assert((size_t)biasBlobIndex < blobs.size());
  74             const Mat& b = blobs[weightsBlobIndex];
  75             CV_Assert(b.isContinuous() && b.type() == CV_32F && b.total() == (size_t)n);
  76         }
  77
  78         const float* meanData = blobs[0].ptr<float>();
  79         const float* stdData = blobs[1].ptr<float>();
  80         const float* weightsData = hasWeights ? blobs[weightsBlobIndex].ptr<float>() : 0;
  81         const float* biasData = hasBias ? blobs[biasBlobIndex].ptr<float>() : 0;
  82
  83         weights_.create(1, (int)n, CV_32F);
  84         bias_.create(1, (int)n, CV_32F);
  85
  86         float* dstWeightsData = weights_.ptr<float>();
  87         float* dstBiasData = bias_.ptr<float>();
  88
  89         for (size_t i = 0; i < n; ++i)
  90         {
  91             float w = (hasWeights ? weightsData[i] : 1.0f) / sqrt(stdData[i] * varMeanScale + epsilon);
  92             dstWeightsData[i] = w;
  93             dstBiasData[i] = (hasBias ? biasData[i] : 0.0f) - w * meanData[i] * varMeanScale;
  94         }
  95     }
  96
  97     void getScaleShift(Mat& scale, Mat& shift) const CV_OVERRIDE
  98     {
  99         scale = weights_;
 100         shift = bias_;
 101     }
 102
 103     virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
 104     {
 105         Mat w, b;
 106         top->getScaleShift(w, b);
 107         if (w.empty() && b.empty())
 108             return false;
 109
 110         const int numChannels = weights_.total();
 111         const int numFusedWeights = w.total();
 112         const int numFusedBias = b.total();
 113
 114         if ((numFusedWeights != numChannels && numFusedWeights != 1 && !w.empty()) ||
 115             (numFusedBias != numChannels && numFusedBias != 1 && !b.empty()))
 116             return false;
 117
 118         if (!w.empty())
 119         {
 120             w = w.reshape(1, 1);
 121             if (numFusedWeights == 1)
 122             {
 123                 multiply(weights_, w.at<float>(0), weights_);
 124                 multiply(bias_, w.at<float>(0), bias_);
 125             }
 126             else
 127             {
 128                 multiply(weights_, w, weights_);
 129                 multiply(bias_, w, bias_);
 130             }
 131         }
 132         if (!b.empty())
 133         {
 134             b = b.reshape(1, 1);
 135             if (numFusedBias == 1)
 136                 add(bias_, b.at<float>(0), bias_);
 137             else
 138                 add(bias_, b.reshape(1, 1), bias_);
 139         }
 140         return true;
 141     }
 142
 143     bool getMemoryShapes(const std::vector<MatShape> &inputs,
 144                          const int requiredOutputs,
 145                          std::vector<MatShape> &outputs,
 146                          std::vector<MatShape> &internals) const CV_OVERRIDE
 147     {
 148         dims = inputs[0].size();
 149         if (!useGlobalStats && inputs[0][0] != 1)
 150             CV_Error(Error::StsNotImplemented, "Batch normalization in training mode with batch size > 1");
 151         Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
 152         return true;
 153     }
 154
 155     virtual bool supportBackend(int backendId) CV_OVERRIDE
 156     {
 157         return (backendId == DNN_BACKEND_OPENCV) ||
 158                (backendId == DNN_BACKEND_HALIDE && haveHalide()) ||
 159                (backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && (preferableTarget == DNN_TARGET_CPU || dims == 4));
 160     }
 161
 162 #ifdef HAVE_OPENCL
 163     bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
 164     {
 165         std::vector<UMat> inputs;
 166         std::vector<UMat> outputs;
 167
 168         bool use_half = (inputs_.depth() == CV_16S);
 169         inputs_.getUMatVector(inputs);
 170         outputs_.getUMatVector(outputs);
 171
 172         CV_Assert(blobs.size() >= 2);
 173         CV_Assert(inputs.size() == 1);
 174
 175         if (use_half && inputs[0].dims == 2)
 176             return false;
 177
 178         if (umat_weight.empty())
 179         {
 180             weights_.copyTo(umat_weight);
 181             bias_.copyTo(umat_bias);
 182         }
 183
 184         UMat &inpBlob = inputs[0];
 185         int groups = inpBlob.size[0];
 186         int channels = inpBlob.size[1];
 187         int planeSize = 1;
 188         for (size_t i = 2; i < inpBlob.dims; i++) {
 189             planeSize *= inpBlob.size[i];
 190         }
 191
 192         String opts = (use_half) ? " -DDtype=half" : " -DDtype=float";
 193         for (size_t ii = 0; ii < outputs.size(); ii++)
 194         {
 195             if (inpBlob.dims == 2)
 196             {
 197                 UMat& src = inputs[ii];
 198                 UMat& dst = outputs[ii];
 199                 multiply(src, weights_, dst);
 200                 add(dst, bias_, dst);
 201             }
 202             else
 203             {
 204                 MatShape s = shape(groups * channels, planeSize);
 205                 UMat src = inputs[ii].reshape(1, s.size(), &s[0]);
 206                 UMat dst = outputs[ii].reshape(1, s.size(), &s[0]);
 207                 int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1);
 208                 String buildopt = format("-DNUM=%d", number) + opts;
 209                 String kname = format("batch_norm%d", number);
 210                 if (number == 1)
 211                     buildopt += format(" -Dconvert_T=convert_%s", use_half ? "half" : "float");
 212                 else
 213                     buildopt += format(" -Dconvert_T=convert_%s%d", use_half ? "half" : "float", number);
 214                 ocl::Kernel kernel(kname.c_str(), ocl::dnn::batchnorm_oclsrc, buildopt);
 215                 if (kernel.empty())
 216                     return false;
 217                 size_t global[] = { (size_t)s[0], (size_t)(s[1] / number) };
 218                 kernel.set(0, ocl::KernelArg::PtrReadOnly(src));
 219                 kernel.set(1, (int)s[0]);
 220                 kernel.set(2, (int)s[1]);
 221                 kernel.set(3, (int)channels);
 222                 kernel.set(4, ocl::KernelArg::PtrReadOnly(umat_weight));
 223                 kernel.set(5, ocl::KernelArg::PtrReadOnly(umat_bias));
 224                 kernel.set(6, ocl::KernelArg::PtrWriteOnly(dst));
 225                 bool ret = kernel.run(2, global, NULL, false);
 226                 if (!ret)
 227                     return false;
 228             }
 229         }
 230         return true;
 231     }
 232 #endif
 233
 234     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
 235     {
 236         CV_TRACE_FUNCTION();
 237         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 238
 239         CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
 240                    forward_ocl(inputs_arr, outputs_arr, internals_arr))
 241
 242         if (inputs_arr.depth() == CV_16S)
 243         {
 244             forward_fallback(inputs_arr, outputs_arr, internals_arr);
 245             return;
 246         }
 247
 248         std::vector<Mat> inputs, outputs;
 249         inputs_arr.getMatVector(inputs);
 250         outputs_arr.getMatVector(outputs);
 251
 252         CV_Assert(blobs.size() >= 2);
 253         CV_Assert(inputs.size() == 1);
 254
 255         Mat &inpBlob = inputs[0];
 256         int planeSize = 1;
 257         for (size_t i = 2; i < inpBlob.dims; i++) {
 258             planeSize *= inpBlob.size[i];
 259         }
 260
 261         for (size_t ii = 0; ii < outputs.size(); ii++)
 262         {
 263             Mat &outBlob = outputs[ii];
 264
 265             for(int num = 0; num < outBlob.size[0]; num++)
 266             {
 267                 for (int n = 0; n < outBlob.size[1]; n++)
 268                 {
 269                     float w = weights_.at<float>(n);
 270                     float b = bias_.at<float>(n);
 271                     Mat inpBlobPlane(1, planeSize, CV_32F, inpBlob.ptr<float>(num, n));
 272                     Mat outBlobPlane(1, planeSize, CV_32F, outBlob.ptr<float>(num, n));
 273                     inpBlobPlane.convertTo(outBlobPlane, CV_32F, w, b);
 274                 }
 275             }
 276         }
 277     }
 278
 279     void forwardSlice(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const CV_OVERRIDE
 280     {
 281         for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
 282         {
 283             int i = 0;
 284             float w = weights_.at<float>(cn);
 285             float b = bias_.at<float>(cn);
 286 #if CV_SIMD128
 287             v_float32x4 wV = v_setall_f32(w), bV = v_setall_f32(b);
 288             for( ; i <= len - 16; i += 16 )
 289             {
 290                 v_float32x4 x0 = v_load(srcptr + i);
 291                 v_float32x4 x1 = v_load(srcptr + i + 4);
 292                 v_float32x4 x2 = v_load(srcptr + i + 8);
 293                 v_float32x4 x3 = v_load(srcptr + i + 12);
 294                 x0 = v_muladd(x0, wV, bV);
 295                 x1 = v_muladd(x1, wV, bV);
 296                 x2 = v_muladd(x2, wV, bV);
 297                 x3 = v_muladd(x3, wV, bV);
 298                 v_store(dstptr + i, x0);
 299                 v_store(dstptr + i + 4, x1);
 300                 v_store(dstptr + i + 8, x2);
 301                 v_store(dstptr + i + 12, x3);
 302             }
 303 #endif
 304             for( ; i < len; i++ )
 305                 dstptr[i] = w * srcptr[i] + b;
 306         }
 307     }
 308
 309     virtual Ptr<BackendNode> tryAttach(const Ptr<BackendNode>& node) CV_OVERRIDE
 310     {
 311         switch (node->backendId)
 312         {
 313             case DNN_BACKEND_HALIDE:
 314             {
 315 #ifdef HAVE_HALIDE
 316                 auto base = node.dynamicCast<HalideBackendNode>();
 317                 Halide::Func& input = base->funcs.back();
 318                 Halide::Var x("x"), y("y"), c("c"), n("n");
 319                 Halide::Func top = attachHalide(input(x, y, c, n));
 320                 return Ptr<BackendNode>(new HalideBackendNode(base, top));
 321 #endif  // HAVE_HALIDE
 322                 break;
 323             }
 324         }
 325         return Ptr<BackendNode>();
 326     }
 327
 328     virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
 329     {
 330 #ifdef HAVE_HALIDE
 331         Halide::Buffer<float> input = halideBuffer(inputs[0]);
 332         Halide::Var x("x"), y("y"), c("c"), n("n");
 333         Halide::Func top = attachHalide(input(x, y, c, n));
 334         return Ptr<BackendNode>(new HalideBackendNode(top));
 335 #endif  // HAVE_HALIDE
 336         return Ptr<BackendNode>();
 337     }
 338
 339 #ifdef HAVE_HALIDE
 340     // attachHalide can work both with Halide::Buffer and Halide::Func. In the
 341     // second case it will be a fusion.
 342     Halide::Func attachHalide(const Halide::Expr& input)
 343     {
 344         Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
 345         Halide::Var x("x"), y("y"), c("c"), n("n");
 346
 347         const int numChannels = weights_.total();
 348         auto weights = wrapToHalideBuffer(weights_, {numChannels});
 349         auto bias = wrapToHalideBuffer(bias_, {numChannels});
 350         top(x, y, c, n) = input * weights(c) + bias(c);
 351         return top;
 352     }
 353 #endif  // HAVE_HALIDE
 354
 355 #ifdef HAVE_INF_ENGINE
 356     virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
 357     {
 358         InferenceEngine::Builder::Layer ieLayer = InferenceEngine::Builder::ScaleShiftLayer(name);
 359         const size_t numChannels = weights_.total();
 360         addConstantData("weights", wrapToInfEngineBlob(weights_, {numChannels}, InferenceEngine::Layout::C), ieLayer);
 361         addConstantData("biases", wrapToInfEngineBlob(bias_, {numChannels}, InferenceEngine::Layout::C), ieLayer);
 362         return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
 363     }
 364 #endif  // HAVE_INF_ENGINE
 365
 366     virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
 367                            const std::vector<MatShape> &outputs) const CV_OVERRIDE
 368     {
 369         CV_UNUSED(outputs); // suppress unused variable warning
 370
 371         int64 flops = 0;
 372         for(int i = 0; i < inputs.size(); i++)
 373         {
 374             flops += 3*total(inputs[i]);
 375         }
 376         return flops;
 377     }
 378
 379 private:
 380     bool useGlobalStats;
 381 };
 382
 383 Ptr<BatchNormLayer> BatchNormLayer::create(const LayerParams& params)
 384 {
 385     return Ptr<BatchNormLayer>(new BatchNormLayerImpl(params));
 386 }
 387
 388 }  // namespace dnn
 389 }  // namespace cv