1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
5 // Copyright (C) 2016, Intel Corporation, all rights reserved.
6 // Third party copyrights are property of their respective owners.
9 Implementation of Batch Normalization layer.
12 #include "../precomp.hpp"
13 #include "layers_common.hpp"
14 #include "../op_halide.hpp"
15 #include "../op_inf_engine.hpp"
16 #include <opencv2/dnn/shape_utils.hpp>
19 #include "opencl_kernels_dnn.hpp"
// Batch-normalization layer implementation. The constructor folds the stored
// statistics (and the optional weights/bias blobs) into one per-channel
// multiplier (weights_) and one per-channel offset (bias_); the forward paths
// then compute y = x * weights_[c] + bias_[c].
// NOTE(review): this listing omits lines of the original file (braces and some
// declarations), so members such as weights_, bias_, dims and epsilon are used
// further down without a visible declaration.
13 class BatchNormLayerImpl CV_FINAL : public BatchNormLayer
// UMat copies of weights_ / bias_; forward_ocl fills them the first time it
// finds umat_weight empty and passes them to the batch_norm kernel.
14 UMat umat_weight, umat_bias;
// Constructs the layer from `params` and precomputes the per-channel multiplier
// (weights_) and offset (bias_) that the forward paths apply.
//
// Parameters read from `params` (with their defaults): "has_weight" (false),
// "has_bias" (false), "use_global_stats" (true), "scale_bias" (false; when set it
// switches both hasWeights and hasBias on) and "eps" (1E-5).
//
// Blob layout as used below: blobs[0] is the mean, blobs[1] is the statistic that
// goes under the square root (named stdData here), the last blob is the bias when
// hasBias is set, and the weights blob sits directly before the bias blob (or is
// the last blob when there is no bias).
//
// A CV_Assert fails when fewer than two blobs are present or when a blob that is
// about to be read is not a continuous CV_32F array with as many elements as
// blobs[0].
35 BatchNormLayerImpl(const LayerParams& params)
38 setParamsFrom(params);
39 CV_Assert(blobs.size() >= 2);
41 hasWeights = params.get<bool>("has_weight", false);
42 hasBias = params.get<bool>("has_bias", false);
43 useGlobalStats = params.get<bool>("use_global_stats", true);
// "scale_bias" is a shorthand that enables weights and bias together.
44 if(params.get<bool>("scale_bias", false))
45 hasWeights = hasBias = true;
46 epsilon = params.get<float>("eps", 1E-5);
// n is the per-channel element count every other blob has to match.
48 size_t n = blobs[0].total();
49 CV_Assert(blobs[1].total() == n &&
50 blobs[0].isContinuous() && blobs[1].isContinuous() &&
51 blobs[0].type() == CV_32F && blobs[1].type() == CV_32F);
// Optional third blob: one scale factor applied to both mean and variance. It is
// consulted only when the layer has neither weights nor bias and global statistics
// are in use. A non-zero factor is inverted so it can be applied by
// multiplication; a zero factor is kept as 0.
53 float varMeanScale = 1.f;
54 if (!hasWeights && !hasBias && blobs.size() > 2 && useGlobalStats) {
55 CV_Assert(blobs.size() == 3); CV_CheckTypeEQ(blobs[2].type(), CV_32FC1, "");
56 varMeanScale = blobs[2].at<float>(0);
57 if (varMeanScale != 0)
58 varMeanScale = 1/varMeanScale;
// The bias blob, when present, is the last one. Subtracting hasBias moves the
// weights index one slot earlier only if a bias blob occupies the last slot.
61 const int biasBlobIndex = blobs.size() - 1;
62 const int weightsBlobIndex = biasBlobIndex - hasBias;
// Validate the weights blob before it is read in the loop below.
// NOTE(review): the condition guarding this check is not visible in this
// listing - presumably it runs only when hasWeights is set.
66 CV_Assert((size_t)weightsBlobIndex < blobs.size());
67 const Mat& w = blobs[weightsBlobIndex];
68 CV_Assert(w.isContinuous() && w.type() == CV_32F && w.total() == (size_t)n);
// Validate the bias blob itself (index biasBlobIndex): it is the blob biasData
// points at below, so its layout, type and element count must be checked before
// the loop reads n floats from it. Checking blobs[weightsBlobIndex] here would
// leave the real bias blob unchecked.
// NOTE(review): the guard is not visible in this listing - presumably hasBias.
73 CV_Assert((size_t)biasBlobIndex < blobs.size());
74 const Mat& b = blobs[biasBlobIndex];
75 CV_Assert(b.isContinuous() && b.type() == CV_32F && b.total() == (size_t)n);
// Raw pointers into the source blobs; weights/bias pointers stay null when the
// corresponding blob is not used.
78 const float* meanData = blobs[0].ptr<float>();
79 const float* stdData = blobs[1].ptr<float>();
80 const float* weightsData = hasWeights ? blobs[weightsBlobIndex].ptr<float>() : 0;
81 const float* biasData = hasBias ? blobs[biasBlobIndex].ptr<float>() : 0;
// Folded coefficients are stored as single-row float matrices with n columns.
83 weights_.create(1, (int)n, CV_32F);
84 bias_.create(1, (int)n, CV_32F);
86 float* dstWeightsData = weights_.ptr<float>();
87 float* dstBiasData = bias_.ptr<float>();
// Fold statistics, epsilon and the optional weights/bias into y = w * x + b:
//   w = weight / sqrt(stdData * varMeanScale + epsilon)   (weight = 1 if absent)
//   b = bias - w * mean * varMeanScale                     (bias   = 0 if absent)
89 for (size_t i = 0; i < n; ++i)
91 float w = (hasWeights ? weightsData[i] : 1.0f) / sqrt(stdData[i] * varMeanScale + epsilon);
92 dstWeightsData[i] = w;
93 dstBiasData[i] = (hasBias ? biasData[i] : 0.0f) - w * meanData[i] * varMeanScale;
// Reports a scale and a shift through the two reference parameters.
// NOTE(review): the body is not part of this listing - presumably it hands out
// weights_ and bias_; confirm against the full file. tryFuse() makes the same
// call on the following layer and checks the results with empty().
97 void getScaleShift(Mat& scale, Mat& shift) const CV_OVERRIDE
// Tries to absorb the following layer `top` into this one by folding top's
// scale and shift into weights_ and bias_.
// NOTE(review): the return statements, the declarations of w and b and some
// branch headers are missing from this listing; presumably the function returns
// false when fusion is rejected and true after weights_/bias_ were updated.
103 virtual bool tryFuse(Ptr<Layer>& top) CV_OVERRIDE
// Ask the next layer for its scale (w) and shift (b).
106 top->getScaleShift(w, b);
// Both empty: the next layer offers nothing to fold in.
107 if (w.empty() && b.empty())
110 const int numChannels = weights_.total();
111 const int numFusedWeights = w.total();
112 const int numFusedBias = b.total();
// A non-empty w or b must hold either one value per channel or one single value;
// any other element count makes this condition true.
114 if ((numFusedWeights != numChannels && numFusedWeights != 1 && !w.empty()) ||
115 (numFusedBias != numChannels && numFusedBias != 1 && !b.empty()))
// Scale: w_top * (weights_ * x + bias_) requires multiplying both weights_ and
// bias_ by w_top. First form: one factor shared by all channels.
121 if (numFusedWeights == 1)
123 multiply(weights_, w.at<float>(0), weights_);
124 multiply(bias_, w.at<float>(0), bias_);
// Second form: element-wise product with per-channel factors.
128 multiply(weights_, w, weights_);
129 multiply(bias_, w, bias_);
// Shift: only bias_ changes. A single value is added as a scalar; otherwise b is
// reshaped to one channel and one row before the element-wise add (bias_ was
// created as a single row in the constructor).
135 if (numFusedBias == 1)
136 add(bias_, b.at<float>(0), bias_);
138 add(bias_, b.reshape(1, 1), bias_);
// Shape inference. Records the number of dimensions of the first input in `dims`
// (supportBackend() reads it for the Inference Engine backend) and delegates the
// actual shape computation to Layer::getMemoryShapes.
// Raises Error::StsNotImplemented when use_global_stats is off and the first
// dimension of the first input (the batch size, per the error text) is not 1.
// NOTE(review): the return statement is not part of this listing.
143 bool getMemoryShapes(const std::vector<MatShape> &inputs,
144 const int requiredOutputs,
145 std::vector<MatShape> &outputs,
146 std::vector<MatShape> &internals) const CV_OVERRIDE
148 dims = inputs[0].size();
149 if (!useGlobalStats && inputs[0][0] != 1)
150 CV_Error(Error::StsNotImplemented, "Batch normalization in training mode with batch size > 1");
151 Layer::getMemoryShapes(inputs, requiredOutputs, outputs, internals);
// Tells whether the layer can run on `backendId`:
//  - DNN_BACKEND_OPENCV: always;
//  - DNN_BACKEND_HALIDE: when haveHalide() is true;
//  - DNN_BACKEND_INFERENCE_ENGINE: when haveInfEngine() is true and either the
//    preferable target is the CPU or the input rank stored in `dims` is 4.
155 virtual bool supportBackend(int backendId) CV_OVERRIDE
157 return (backendId == DNN_BACKEND_OPENCV) ||
158 (backendId == DNN_BACKEND_HALIDE && haveHalide()) ||
159 (backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && (preferableTarget == DNN_TARGET_CPU || dims == 4));
// OpenCL forward pass; forward() invokes it through CV_OCL_RUN.
// NOTE(review): several lines of the original are missing from this listing (the
// statements following some `if` headers, the declaration of planeSize, the
// branch headers around the two convert_T options, the handling of `ret` and the
// final return). The comments below describe only what is visible.
163 bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
165 std::vector<UMat> inputs;
166 std::vector<UMat> outputs;
// Inputs of depth CV_16S are treated as half precision: use_half selects
// Dtype=half and the convert_half* conversions below.
168 bool use_half = (inputs_.depth() == CV_16S);
169 inputs_.getUMatVector(inputs);
170 outputs_.getUMatVector(outputs);
172 CV_Assert(blobs.size() >= 2);
173 CV_Assert(inputs.size() == 1);
// Half-precision 2-D input is special-cased here (the action taken is not shown
// in this listing).
175 if (use_half && inputs[0].dims == 2)
// Copy the folded coefficients into the UMat members the first time through.
178 if (umat_weight.empty())
180 weights_.copyTo(umat_weight);
181 bias_.copyTo(umat_bias);
184 UMat &inpBlob = inputs[0];
185 int groups = inpBlob.size[0];
186 int channels = inpBlob.size[1];
// planeSize accumulates the product of all dimensions after the first two.
188 for (size_t i = 2; i < inpBlob.dims; i++) {
189 planeSize *= inpBlob.size[i];
192 String opts = (use_half) ? " -DDtype=half" : " -DDtype=float";
193 for (size_t ii = 0; ii < outputs.size(); ii++)
// 2-D input: scale and shift with the generic multiply/add, using the Mat
// members weights_ and bias_ directly.
195 if (inpBlob.dims == 2)
197 UMat& src = inputs[ii];
198 UMat& dst = outputs[ii];
199 multiply(src, weights_, dst);
200 add(dst, bias_, dst);
// Otherwise view input and output as (groups * channels) rows of planeSize
// elements each and run the batch_norm kernel on that 2-D view.
204 MatShape s = shape(groups * channels, planeSize);
205 UMat src = inputs[ii].reshape(1, s.size(), &s[0]);
206 UMat dst = outputs[ii].reshape(1, s.size(), &s[0]);
// Vector width: 8 when the row length is divisible by 8, else 4 when divisible
// by 4, else 1. It selects both the NUM define and the kernel name.
207 int number = (s[1] % 8 == 0) ? 8 : ((s[1] % 4 == 0) ? 4 : 1);
208 String buildopt = format("-DNUM=%d", number) + opts;
209 String kname = format("batch_norm%d", number);
// Two alternative definitions of convert_T: without and with the vector-width
// suffix. NOTE(review): the selecting condition is not shown - presumably the
// suffix-less form is used when number == 1.
211 buildopt += format(" -Dconvert_T=convert_%s", use_half ? "half" : "float");
213 buildopt += format(" -Dconvert_T=convert_%s%d", use_half ? "half" : "float", number);
214 ocl::Kernel kernel(kname.c_str(), ocl::dnn::batchnorm_oclsrc, buildopt);
// 2-D global size: one entry per row and per group of `number` elements in a row.
217 size_t global[] = { (size_t)s[0], (size_t)(s[1] / number) };
// Kernel arguments: source, row count, row length, channel count, folded
// weights, folded bias, destination.
218 kernel.set(0, ocl::KernelArg::PtrReadOnly(src));
219 kernel.set(1, (int)s[0]);
220 kernel.set(2, (int)s[1]);
221 kernel.set(3, (int)channels);
222 kernel.set(4, ocl::KernelArg::PtrReadOnly(umat_weight));
223 kernel.set(5, ocl::KernelArg::PtrReadOnly(umat_bias));
224 kernel.set(6, ocl::KernelArg::PtrWriteOnly(dst));
225 bool ret = kernel.run(2, global, NULL, false);
// Forward pass entry point: tries OpenCL first, hands CV_16S data to
// forward_fallback, and otherwise applies out = in * w + b plane by plane.
// NOTE(review): the declaration of planeSize and the statements that end the
// CV_16S branch are not part of this listing.
234 void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
237 CV_TRACE_ARG_VALUE(name, "name", name.c_str());
// Run forward_ocl when the preferable target is an OpenCL one.
239 CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
240 forward_ocl(inputs_arr, outputs_arr, internals_arr))
// CV_16S input goes to forward_fallback instead of the float loop below
// (presumably followed by a return - not shown here).
242 if (inputs_arr.depth() == CV_16S)
244 forward_fallback(inputs_arr, outputs_arr, internals_arr);
248 std::vector<Mat> inputs, outputs;
249 inputs_arr.getMatVector(inputs);
250 outputs_arr.getMatVector(outputs);
252 CV_Assert(blobs.size() >= 2);
253 CV_Assert(inputs.size() == 1);
255 Mat &inpBlob = inputs[0];
// planeSize accumulates the product of all dimensions after the first two, i.e.
// the number of floats addressed by one (num, n) pair below.
257 for (size_t i = 2; i < inpBlob.dims; i++) {
258 planeSize *= inpBlob.size[i];
261 for (size_t ii = 0; ii < outputs.size(); ii++)
263 Mat &outBlob = outputs[ii];
// For every index pair (num, n) over the first two dimensions, wrap the matching
// input and output planes in 1 x planeSize Mat headers (no copy) and let
// convertTo compute out = in * w + b, with w and b taken at index n.
265 for(int num = 0; num < outBlob.size[0]; num++)
267 for (int n = 0; n < outBlob.size[1]; n++)
269 float w = weights_.at<float>(n);
270 float b = bias_.at<float>(n);
271 Mat inpBlobPlane(1, planeSize, CV_32F, inpBlob.ptr<float>(num, n));
272 Mat outBlobPlane(1, planeSize, CV_32F, outBlob.ptr<float>(num, n));
273 inpBlobPlane.convertTo(outBlobPlane, CV_32F, w, b);
// Applies dst = w * src + b to channels cn0 .. cn1-1, processing `len` floats
// per channel. srcptr and dstptr address the first of those channels and are
// advanced by planeSize floats after each channel.
// NOTE(review): the declaration of the index `i` and any preprocessor guard
// around the vectorized loop are not part of this listing.
279 void forwardSlice(const float* srcptr, float* dstptr, int len, size_t planeSize, int cn0, int cn1) const CV_OVERRIDE
281 for( int cn = cn0; cn < cn1; cn++, srcptr += planeSize, dstptr += planeSize )
// Per-channel coefficients.
284 float w = weights_.at<float>(cn);
285 float b = bias_.at<float>(cn);
// Vectorized part: w and b are broadcast once, then each iteration handles 16
// floats as four v_float32x4 multiply-adds (x * w + b).
287 v_float32x4 wV = v_setall_f32(w), bV = v_setall_f32(b);
288 for( ; i <= len - 16; i += 16 )
290 v_float32x4 x0 = v_load(srcptr + i);
291 v_float32x4 x1 = v_load(srcptr + i + 4);
292 v_float32x4 x2 = v_load(srcptr + i + 8);
293 v_float32x4 x3 = v_load(srcptr + i + 12);
294 x0 = v_muladd(x0, wV, bV);
295 x1 = v_muladd(x1, wV, bV);
296 x2 = v_muladd(x2, wV, bV);
297 x3 = v_muladd(x3, wV, bV);
298 v_store(dstptr + i, x0);
299 v_store(dstptr + i + 4, x1);
300 v_store(dstptr + i + 8, x2);
301 v_store(dstptr + i + 12, x3);
// Scalar loop for whatever the 16-wide loop left unprocessed.
304 for( ; i < len; i++ )
305 dstptr[i] = w * srcptr[i] + b;
// Attaches this layer to an already built backend node.
// For a Halide node: takes the node's last Func, applies attachHalide() to it
// evaluated at (x, y, c, n), and returns a new HalideBackendNode built from the
// old node and the resulting Func. The final statement returns an empty Ptr.
// NOTE(review): the opening HAVE_HALIDE guard and the remaining switch structure
// are not part of this listing - presumably the empty Ptr covers every other
// backend.
309 virtual Ptr<BackendNode> tryAttach(const Ptr<BackendNode>& node) CV_OVERRIDE
311 switch (node->backendId)
313 case DNN_BACKEND_HALIDE:
316 auto base = node.dynamicCast<HalideBackendNode>();
317 Halide::Func& input = base->funcs.back();
318 Halide::Var x("x"), y("y"), c("c"), n("n");
319 Halide::Func top = attachHalide(input(x, y, c, n));
320 return Ptr<BackendNode>(new HalideBackendNode(base, top));
321 #endif // HAVE_HALIDE
325 return Ptr<BackendNode>();
// Builds a stand-alone Halide node for this layer: wraps the first input as a
// Halide::Buffer<float>, applies attachHalide() to it evaluated at (x, y, c, n)
// and returns the resulting Func in a new HalideBackendNode.
// The trailing statement after the #endif returns an empty Ptr.
// NOTE(review): the opening HAVE_HALIDE guard is not part of this listing.
328 virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
331 Halide::Buffer<float> input = halideBuffer(inputs[0]);
332 Halide::Var x("x"), y("y"), c("c"), n("n");
333 Halide::Func top = attachHalide(input(x, y, c, n));
334 return Ptr<BackendNode>(new HalideBackendNode(top));
335 #endif // HAVE_HALIDE
336 return Ptr<BackendNode>();
340 // attachHalide can work both with Halide::Buffer and Halide::Func. In the
341 // second case it will be a fusion.
// `input` is the upstream expression already evaluated at (x, y, c, n). The
// function defines a Func `top` over the same four variables as
// input * weights(c) + bias(c), where weights and bias are Halide buffers of
// numChannels elements wrapping weights_ and bias_.
// The Func is named after the layer unless the layer name is empty.
// NOTE(review): the return statement is not part of this listing.
342 Halide::Func attachHalide(const Halide::Expr& input)
344 Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
345 Halide::Var x("x"), y("y"), c("c"), n("n");
347 const int numChannels = weights_.total();
348 auto weights = wrapToHalideBuffer(weights_, {numChannels});
349 auto bias = wrapToHalideBuffer(bias_, {numChannels});
350 top(x, y, c, n) = input * weights(c) + bias(c);
353 #endif // HAVE_HALIDE
355 #ifdef HAVE_INF_ENGINE
// Builds the Inference Engine node for this layer: a ScaleShiftLayer named after
// the layer, with weights_ and bias_ attached as constant data under the keys
// "weights" and "biases". Both are wrapped as blobs of numChannels elements with
// Layout::C. The input wrappers are not used.
356 virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
358 InferenceEngine::Builder::Layer ieLayer = InferenceEngine::Builder::ScaleShiftLayer(name);
359 const size_t numChannels = weights_.total();
360 addConstantData("weights", wrapToInfEngineBlob(weights_, {numChannels}, InferenceEngine::Layout::C), ieLayer);
361 addConstantData("biases", wrapToInfEngineBlob(bias_, {numChannels}, InferenceEngine::Layout::C), ieLayer);
362 return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
364 #endif // HAVE_INF_ENGINE
// Estimates the layer's cost: adds 3 * total(inputs[i]) to `flops` for every
// input shape, i.e. three operations per input element. `outputs` is unused.
// NOTE(review): the declaration of `flops` and the return statement are not part
// of this listing.
366 virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
367 const std::vector<MatShape> &outputs) const CV_OVERRIDE
369 CV_UNUSED(outputs); // suppress unused variable warning
372 for(int i = 0; i < inputs.size(); i++)
374 flops += 3*total(inputs[i]);
// Factory for the public BatchNormLayer interface: allocates a
// BatchNormLayerImpl constructed from `params` and returns it wrapped in a Ptr.
383 Ptr<BatchNormLayer> BatchNormLayer::create(const LayerParams& params)
385 return Ptr<BatchNormLayer>(new BatchNormLayerImpl(params));