modules/dnn/src/layers/fully_connected_layer.cpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include <opencv2/dnn/shape_utils.hpp>

#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
using namespace cv::dnn::ocl4dnn;
#endif

namespace cv
{
namespace dnn
{

class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer
{
public:
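    // Weight rows are zero-padded to a multiple of VEC_ALIGN floats so the
    // inner loops below can use aligned SIMD loads over whole vectors.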
    enum { VEC_ALIGN = 8 };

#ifdef HAVE_OPENCL
    Ptr<OCL4DNNInnerProduct<float> > innerProductOp;
    std::vector<UMat> umat_blobs;
    std::vector<UMat> half_blobs;
#endif

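    // Reads num_output/bias_term/axis from the layer parameters, reshapes the
    // weight blob to numOutput x innerSize and, when the inner size is not a
    // multiple of VEC_ALIGN, copies the weights into a zero-padded buffer.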
    FullyConnectedLayerImpl(const LayerParams& params)
    {
        setParamsFrom(params);
        CV_Assert(1 <= blobs.size() && blobs.size() <= 2);

        int numOutput = params.get<int>("num_output");
        int innerSize = (int)blobs[0].total() / numOutput;
        bias = params.get<bool>("bias_term", true);
        axis = params.get<int>("axis", 1);

        CV_Assert(blobs[0].dims >= 2 && (size_t)(innerSize * numOutput) == blobs[0].total());
        CV_Assert(!bias || (blobs.size() == 2 && (size_t)numOutput == blobs[1].total()));

        weightsMat = blobs[0] = blobs[0].reshape(1, numOutput);
        int vecsize = weightsMat.cols;
        if( vecsize % VEC_ALIGN != 0 )
        {
            int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);
            Mat weightsBuf(weightsMat.rows, vecsize_aligned, weightsMat.type());
            Mat wpadding = weightsBuf.colRange(vecsize, vecsize_aligned);
            wpadding.setTo(Scalar::all(0.));
            weightsMat = weightsBuf.colRange(0, vecsize);
            blobs[0].copyTo(weightsMat);
        }

        if (bias)
            biasMat = blobs[1] = blobs[1].reshape(1, 1);
        else
            biasMat = Mat::zeros(1, numOutput, weightsMat.type());
    }

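    // The output shape keeps the input dimensions before `axis` and replaces
    // everything from `axis` onward with a single dimension of numOutput.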
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() == 1);
        CV_Assert(1 <= blobs.size() && blobs.size() <= 2);
        CV_Assert(blobs[0].dims == 2);

        int cAxis = clamp(axis, inputs[0]);
        int numOutput = blobs[0].size[0];
        MatShape outShape(cAxis + 1);
        for (int i = 0; i < cAxis; ++i)
            outShape[i] = inputs[0][i];
        outShape.back() = numOutput;

        outputs.resize(inputs.size(), outShape);

        CV_Assert(!bias || (size_t)numOutput == blobs[1].total());
        return false;
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV ||
               (backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1) ||
               (backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && axis == 1);
    }

    virtual bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
    {
        if (activ.empty() || layer.empty())
        {
            activ = layer;
            return !activ.empty();
        }
        else
            return false;
    }

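    // Parallel CPU implementation: the (sample, output neuron) space of size
    // nsamples * nw0 is flattened and split into `nstripes` contiguous stripes,
    // each of which is processed independently as a sequence of dot products.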
    class FullyConnected : public ParallelLoopBody
    {
    public:
        FullyConnected() : srcMat(0), weights(0), biasMat(0), activ(0), dstMat(0), nstripes(0), useAVX(false), useAVX2(false), useAVX512(false) {}

        static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat,
                        Mat& dstMat, const ActivationLayer* activ, int nstripes)
        {
            CV_Assert( srcMat.dims == 2 && srcMat.cols == weights.cols &&
                       dstMat.rows == srcMat.rows && dstMat.cols == weights.rows &&
                       srcMat.type() == weights.type() && weights.type() == dstMat.type() &&
                       srcMat.type() == CV_32F &&
                       (biasMat.empty() || (biasMat.type() == srcMat.type() &&
                                           biasMat.isContinuous() && (int)biasMat.total() == dstMat.cols)) );

            FullyConnected p;

            p.srcMat = &srcMat;
            p.weights = &weights;
            p.biasMat = &biasMat;
            p.dstMat = &dstMat;
            p.nstripes = nstripes;
            p.activ = activ;
            p.useAVX = checkHardwareSupport(CPU_AVX);
            p.useAVX2 = checkHardwareSupport(CPU_AVX2);
            p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;

            parallel_for_(Range(0, nstripes), p, nstripes);
        }

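        // Each stripe is a range of flattened (sample, output) offsets. For
        // every run of offsets, the source row is copied into an aligned,
        // zero-padded buffer and multiplied against consecutive weight rows,
        // dispatching to AVX512/AVX2/AVX kernels when available.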
        void operator()(const Range& r) const CV_OVERRIDE
        {
            int valign = FullyConnectedLayerImpl::VEC_ALIGN;
            int nsamples = srcMat->rows;
            int nw0 = weights->rows;
            int k, vecsize = srcMat->cols;
            int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);
            size_t total = (size_t)nsamples*nw0;
            size_t stripeSize = (total + nstripes - 1)/nstripes;
            size_t stripeStart = r.start*stripeSize;
            size_t stripeEnd = r.end == nstripes ? total : std::min(r.end*stripeSize, total);
            size_t wstep = weights->step1();
            AutoBuffer<float> srcbuf(vecsize_aligned + valign);
            float* sptr = alignPtr(srcbuf.data(), (int)(valign*sizeof(float)));

            for( k = vecsize; k < vecsize_aligned; k++ )
                sptr[k] = 0.f;

            for( size_t ofs = stripeStart; ofs < stripeEnd; )
            {
                int sampleIdx = (int)(ofs / nw0);
                int delta = (int)(ofs - (size_t)sampleIdx*nw0);
                const float* sptr_ = srcMat->ptr<float>(sampleIdx);
                const float* wptr = weights->ptr<float>(delta);
                float* dptr = dstMat->ptr<float>(sampleIdx) + delta;
                const float* biasptr = biasMat->ptr<float>() + delta;
                int nw = std::min(nw0 - delta, (int)(stripeEnd - ofs));

                memcpy(sptr, sptr_, vecsize*sizeof(sptr[0]));

            #if CV_TRY_AVX512_SKX
                if( useAVX512 )
                    opt_AVX512_SKX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
                else
            #endif
            #if CV_TRY_AVX2
                if( useAVX2 )
                    opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
                else
            #endif
            #if CV_TRY_AVX
                if( useAVX )
                    opt_AVX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
                else
            #endif
                {
                    int i = 0;

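                    // Universal-intrinsics path: accumulate four output neurons
                    // at a time; v_reduce_sum4 folds the four partial sums into
                    // a single 4-lane vector, which is stored with the bias added.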
            #if CV_SIMD128
                    for( ; i <= nw - 4; i += 4, wptr += 4*wstep )
                    {
                        v_float32x4 vs0 = v_setall_f32(0.f), vs1 = v_setall_f32(0.f);
                        v_float32x4 vs2 = v_setall_f32(0.f), vs3 = v_setall_f32(0.f);

                        for( k = 0; k < vecsize; k += 4 )
                        {
                            v_float32x4 v = v_load_aligned(sptr + k);
                            vs0 += v*v_load_aligned(wptr + k);
                            vs1 += v*v_load_aligned(wptr + wstep + k);
                            vs2 += v*v_load_aligned(wptr + wstep*2 + k);
                            vs3 += v*v_load_aligned(wptr + wstep*3 + k);
                        }

                        v_float32x4 s = v_reduce_sum4(vs0, vs1, vs2, vs3);
                        s += v_load(biasptr + i);
                        v_store(dptr + i, s);
                    }
            #endif

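                    // Scalar tail: handles the remaining output neurons one at a time.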
                    for( ; i < nw; i++, wptr += wstep )
                    {
                        float s0 = biasptr[i];

                        for( k = 0; k < vecsize; k++ )
                        {
                            float v = sptr[k];
                            s0 += v*wptr[k];
                        }
                        dptr[i] = s0;
                    }
                }

                if(activ)
                    activ->forwardSlice(dptr, dptr, 1, 1, delta, delta + nw);

                ofs += nw;
            }
        }

        const Mat *srcMat, *weights, *biasMat;
        const ActivationLayer* activ;
        Mat* dstMat;
        int nstripes;
        bool useAVX;
        bool useAVX2;
        bool useAVX512;
    };

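    // OpenCL path: the blobs are uploaded to UMats once (and converted to fp16
    // when the target runs in half precision), and the ocl4dnn inner-product
    // primitive is reused across calls until finalize() releases it.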
#ifdef HAVE_OPENCL
    virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays) CV_OVERRIDE
    {
        innerProductOp.release();
        umat_blobs.clear();
        half_blobs.clear();
    }

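    // Tries the ocl4dnn kernel first; if it fails for any input, control falls
    // through to the generic cv::gemm-based loop further below.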
    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, InputArrayOfArrays internals)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        bool use_half = (inps.depth() == CV_16S);
        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);

        int axisCan = clamp(axis, inputs[0].dims);
        int numOutput = blobs[0].size[0];
        int innerSize = blobs[0].size[1];
        int outerSize = total(shape(inputs[0]), 0, axisCan);
        bool ret = true;

        if (innerProductOp.empty())
        {
            size_t n = blobs.size();
            umat_blobs.resize(n);
            for (size_t i = 0; i < n; i++) blobs[i].copyTo(umat_blobs[i]);

            OCL4DNNInnerProductConfig config;
            config.num_output = numOutput;
            config.bias_term = bias;
            config.M = outerSize;
            config.K = innerSize;
            config.use_half = use_half;

            if (use_half)
            {
                half_blobs.resize(umat_blobs.size());
                for (size_t i = 0; i < umat_blobs.size(); i++)
                {
                    if (!umat_blobs[i].empty())
                        convertFp16(umat_blobs[i], half_blobs[i]);
                }
            }

            innerProductOp = Ptr<OCL4DNNInnerProduct<float> >(new OCL4DNNInnerProduct<float>(config));
        }

        for (size_t i = 0; i < inputs.size(); i++)
        {
            MatShape inshape, outshape;
            inshape = shape(outerSize, innerSize);
            outshape = shape(outerSize, numOutput);

            UMat srcMat, dstMat;
            srcMat = inputs[i].reshape(1, inshape.size(), &inshape[0]);
            dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]);

            if (!innerProductOp->Forward(srcMat, (use_half) ? half_blobs[0] : umat_blobs[0],
                                         (bias) ? (use_half ? half_blobs[1] : umat_blobs[1]) : UMat(),
                                         dstMat))
            {
                ret = false;
                break;
            }

            if (!use_half && bias && (outerSize > 1))
            {
                UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
                UMat& biases = umat_blobs[1];
                cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
            }
        }

        if (ret) return true;

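        // The ocl4dnn kernel failed for some input: redo all inputs with a
        // plain GEMM (dst = src * W^T, bias added via a rank-1 GEMM), converting
        // to fp32 and back when the tensors are stored in half precision.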
        UMat& weights = umat_blobs[0];
        for (size_t i = 0; i < inputs.size(); i++)
        {
            MatShape inshape, outshape;
            inshape = shape(outerSize, innerSize);
            outshape = shape(outerSize, numOutput);

            UMat srcMat, dstMat, srcMat_fp32, dstMat_fp32;
            srcMat = inputs[i].reshape(1, inshape.size(), &inshape[0]);
            dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]);

            if (use_half)
            {
                convertFp16(srcMat, srcMat_fp32);
                convertFp16(dstMat, dstMat_fp32);
            }
            else
            {
                srcMat_fp32 = srcMat;
                dstMat_fp32 = dstMat;
            }

            cv::gemm(srcMat_fp32, weights, 1, noArray(), 0, dstMat_fp32, GEMM_2_T);

            if (bias)
            {
                UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
                UMat& biases = umat_blobs[1];
                cv::gemm(biasOnesMat, biases, 1, dstMat_fp32, 1, dstMat_fp32, 0);
            }
            if (use_half)
            {
                convertFp16(srcMat_fp32, srcMat);
                convertFp16(dstMat_fp32, dstMat);
            }
        }

        return true;
    }
#endif

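    // Forward entry point: tries OpenCL first on OpenCL targets, falls back to
    // the generic path for fp16 inputs, and otherwise flattens each input to
    // outerSize rows and runs the parallel FullyConnected body.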
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        if (inputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> input, output;
        inputs_arr.getMatVector(input);
        outputs_arr.getMatVector(output);

        int axisCan = clamp(axis, input[0].dims);
        int outerSize = input[0].total(0, axisCan);

        for (size_t i = 0; i < input.size(); i++)
        {
            Mat srcMat = input[i].reshape(1, outerSize);
            Mat dstMat = output[i].reshape(1, outerSize);

            const int nstripes = getNumThreads();
            FullyConnected::run(srcMat, weightsMat, biasMat, dstMat, activ.get(), nstripes);
        }
    }

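    // Halide backend: expresses the layer as a reduction over the input
    // width/height/channels, with the weights wrapped to the same 4-D layout.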
    virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
    {
#ifdef HAVE_HALIDE
        int inW, inH, inC, inN, outC = blobs[0].size[0];
        Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
        getCanonicalSize(inputBuffer, &inW, &inH, &inC, &inN);
        auto weights = wrapToHalideBuffer(blobs[0], {inW, inH, inC, outC});

        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        Halide::RDom r(0, inW, 0, inH, 0, inC);
        Halide::Expr topExpr = sum(inputBuffer(r.x, r.y, r.z, n) *
                                   weights(r.x, r.y, r.z, c));
        if (bias)
        {
            Halide::Buffer<float> bias = wrapToHalideBuffer(blobs[1], {outC});
            topExpr += bias(c);
        }
        top(x, y, c, n) = topExpr;
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }

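    // Inference Engine backend: builds a FullyConnected layer and attaches the
    // weights (as an OIHW blob) and biases as constant data.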
#ifdef HAVE_INF_ENGINE
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
        InferenceEngine::Builder::FullyConnectedLayer ieLayer(name);

        const int outNum = blobs[0].size[0];
        ieLayer.setOutputNum(outNum);

        InferenceEngine::Builder::Layer l = ieLayer;
        addConstantData("weights", wrapToInfEngineBlob(blobs[0], {(size_t)blobs[0].size[0], (size_t)blobs[0].size[1], 1, 1}, InferenceEngine::Layout::OIHW), l);
        if (bias)
            addConstantData("biases", wrapToInfEngineBlob(blobs[1], {(size_t)outNum}, InferenceEngine::Layout::C), l);

        return Ptr<BackendNode>(new InfEngineBackendNode(l));
    }
#endif  // HAVE_INF_ENGINE

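    // Rough cost estimate: each output element is charged about 3 operations
    // per accumulated weight (innerSize multiply-adds plus overhead).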
    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
    {
        CV_UNUSED(inputs); // suppress unused variable warning
        int64 flops = 0;

        int innerSize = blobs[0].size[1];
        for (size_t i = 0; i < outputs.size(); i++)
        {
            flops += CV_BIG_INT(3)*innerSize*total(outputs[i]);
        }

        return flops;
    }

    bool bias;
    Mat weightsMat, biasMat;
    Ptr<ActivationLayer> activ;
};

Ptr<InnerProductLayer> InnerProductLayer::create(const LayerParams& params)
{
    return Ptr<InnerProductLayer>(new FullyConnectedLayerImpl(params));
}

}  // namespace dnn
}  // namespace cv