Merge pull request #14827 from YashasSamaga:cuda4dnn-csl-low
[platform/upstream/opencv.git] modules/dnn/src/layers/fully_connected_layer.cpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include <opencv2/dnn/shape_utils.hpp>

#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
using namespace cv::dnn::ocl4dnn;
#endif

#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/inner_product.hpp"
using namespace cv::dnn::cuda4dnn;
#endif

namespace cv
{
namespace dnn
{

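// Fully connected (inner product) layer. Each input sample is flattened from
// `axis` onwards and the layer computes dst = src * weights^T + bias, with an
// optional fused activation. Besides the default CPU path there are OpenCL,
// CUDA, Halide and Inference Engine backends below.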
class FullyConnectedLayerImpl CV_FINAL : public InnerProductLayer
{
public:
    enum { VEC_ALIGN = 8 };

#ifdef HAVE_OPENCL
    Ptr<OCL4DNNInnerProduct<float> > innerProductOp;
    std::vector<UMat> umat_blobs;
    std::vector<UMat> half_blobs;
#endif

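    // Reads num_output, bias_term and axis from the layer parameters, reshapes
    // the weight blob to numOutput x innerSize and, when the row length is not
    // a multiple of VEC_ALIGN, copies the weights into a buffer whose rows are
    // padded with zeros so the vectorized kernels can read past the true length.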
    FullyConnectedLayerImpl(const LayerParams& params)
    {
        setParamsFrom(params);
        CV_Assert(1 <= blobs.size() && blobs.size() <= 2);

        int numOutput = params.get<int>("num_output");
        int innerSize = (int)blobs[0].total() / numOutput;
        bias = params.get<bool>("bias_term", true);
        axis = params.get<int>("axis", 1);

        CV_Assert(blobs[0].dims >= 2 && (size_t)(innerSize * numOutput) == blobs[0].total());
        CV_Assert(!bias || (blobs.size() == 2 && (size_t)numOutput == blobs[1].total()));

        weightsMat = blobs[0] = blobs[0].reshape(1, numOutput);
        int vecsize = weightsMat.cols;
        if( vecsize % VEC_ALIGN != 0 )
        {
            int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);
            Mat weightsBuf(weightsMat.rows, vecsize_aligned, weightsMat.type());
            Mat wpadding = weightsBuf.colRange(vecsize, vecsize_aligned);
            wpadding.setTo(Scalar::all(0.));
            weightsMat = weightsBuf.colRange(0, vecsize);
            blobs[0].copyTo(weightsMat);
        }

        if (bias)
            biasMat = blobs[1] = blobs[1].reshape(1, 1);
        else
            biasMat = Mat::zeros(1, numOutput, weightsMat.type());
    }

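    // The output shape keeps the first `axis` input dimensions and replaces the
    // remaining ones with a single dimension equal to the number of outputs.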
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() == 1);
        CV_Assert(1 <= blobs.size() && blobs.size() <= 2);
        CV_Assert(blobs[0].dims == 2);

        int cAxis = clamp(axis, inputs[0]);
        int numOutput = blobs[0].size[0];
        MatShape outShape(cAxis + 1);
        for (int i = 0; i < cAxis; ++i)
            outShape[i] = inputs[0][i];
        outShape.back() = numOutput;

        outputs.resize(inputs.size(), outShape);

        CV_Assert(!bias || (size_t)numOutput == blobs[1].total());
        return false;
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA ||
               (backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1) ||
               (backendId == DNN_BACKEND_INFERENCE_ENGINE && haveInfEngine() && axis == 1);
    }

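    // Fuses a following activation layer into this one; the fused activation is
    // applied to each output stripe right after it is computed.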
    virtual bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
    {
        if (activ.empty() || layer.empty())
        {
            activ = layer;
            return !activ.empty();
        }
        else
            return false;
    }

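    // CPU kernel: a ParallelLoopBody that splits the (sample, output neuron)
    // work into nstripes stripes and dispatches to AVX512/AVX2/AVX, universal
    // intrinsics or a plain scalar loop, depending on what the CPU supports.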
    class FullyConnected : public ParallelLoopBody
    {
    public:
        FullyConnected() : srcMat(0), weights(0), biasMat(0), activ(0), dstMat(0), nstripes(0), useAVX(false), useAVX2(false), useAVX512(false) {}

        static void run(const Mat& srcMat, const Mat& weights, const Mat& biasMat,
                        Mat& dstMat, const ActivationLayer* activ, int nstripes)
        {
            CV_Assert( srcMat.dims == 2 && srcMat.cols == weights.cols &&
                       dstMat.rows == srcMat.rows && dstMat.cols == weights.rows &&
                       srcMat.type() == weights.type() && weights.type() == dstMat.type() &&
                       srcMat.type() == CV_32F &&
                       (biasMat.empty() || (biasMat.type() == srcMat.type() &&
                                           biasMat.isContinuous() && (int)biasMat.total() == dstMat.cols)) );

            FullyConnected p;

            p.srcMat = &srcMat;
            p.weights = &weights;
            p.biasMat = &biasMat;
            p.dstMat = &dstMat;
            p.nstripes = nstripes;
            p.activ = activ;
            p.useAVX = checkHardwareSupport(CPU_AVX);
            p.useAVX2 = checkHardwareSupport(CPU_AVX2);
            p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX;

            parallel_for_(Range(0, nstripes), p, nstripes);
        }

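        // Each stripe covers a contiguous range of (sample, output) pairs. The
        // current input row is copied into an aligned, zero-padded buffer so
        // the dot-product loops below can use aligned SIMD loads.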
        void operator()(const Range& r) const CV_OVERRIDE
        {
            int valign = FullyConnectedLayerImpl::VEC_ALIGN;
            int nsamples = srcMat->rows;
            int nw0 = weights->rows;
            int k, vecsize = srcMat->cols;
            int vecsize_aligned = (int)alignSize(vecsize, VEC_ALIGN);
            size_t total = (size_t)nsamples*nw0;
            size_t stripeSize = (total + nstripes - 1)/nstripes;
            size_t stripeStart = r.start*stripeSize;
            size_t stripeEnd = r.end == nstripes ? total : std::min(r.end*stripeSize, total);
            size_t wstep = weights->step1();
            AutoBuffer<float> srcbuf(vecsize_aligned + valign);
            float* sptr = alignPtr(srcbuf.data(), (int)(valign*sizeof(float)));

            for( k = vecsize; k < vecsize_aligned; k++ )
                sptr[k] = 0.f;

            for( size_t ofs = stripeStart; ofs < stripeEnd; )
            {
                int sampleIdx = (int)(ofs / nw0);
                int delta = (int)(ofs - (size_t)sampleIdx*nw0);
                const float* sptr_ = srcMat->ptr<float>(sampleIdx);
                const float* wptr = weights->ptr<float>(delta);
                float* dptr = dstMat->ptr<float>(sampleIdx) + delta;
                const float* biasptr = biasMat->ptr<float>() + delta;
                int nw = std::min(nw0 - delta, (int)(stripeEnd - ofs));

                memcpy(sptr, sptr_, vecsize*sizeof(sptr[0]));

            #if CV_TRY_AVX512_SKX
                if( useAVX512 )
                    opt_AVX512_SKX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
                else
            #endif
            #if CV_TRY_AVX2
                if( useAVX2 )
                    opt_AVX2::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
                else
            #endif
            #if CV_TRY_AVX
                if( useAVX )
                    opt_AVX::fastGEMM1T( sptr, wptr, wstep, biasptr, dptr, nw, vecsize);
                else
            #endif
                {
                    int i = 0;

            #if CV_SIMD128
                    for( ; i <= nw - 4; i += 4, wptr += 4*wstep )
                    {
                        v_float32x4 vs0 = v_setall_f32(0.f), vs1 = v_setall_f32(0.f);
                        v_float32x4 vs2 = v_setall_f32(0.f), vs3 = v_setall_f32(0.f);

                        for( k = 0; k < vecsize; k += 4 )
                        {
                            v_float32x4 v = v_load_aligned(sptr + k);
                            vs0 += v*v_load_aligned(wptr + k);
                            vs1 += v*v_load_aligned(wptr + wstep + k);
                            vs2 += v*v_load_aligned(wptr + wstep*2 + k);
                            vs3 += v*v_load_aligned(wptr + wstep*3 + k);
                        }

                        v_float32x4 s = v_reduce_sum4(vs0, vs1, vs2, vs3);
                        s += v_load(biasptr + i);
                        v_store(dptr + i, s);
                    }
            #endif

                    for( ; i < nw; i++, wptr += wstep )
                    {
                        float s0=biasptr[i];

                        for( k = 0; k < vecsize; k++ )
                        {
                            float v = sptr[k];
                            s0 += v*wptr[k];
                        }
                        dptr[i] = s0;
                    }
                }

                if(activ)
                    activ->forwardSlice(dptr, dptr, 1, 1, delta, delta + nw);

                ofs += nw;
            }
        }

        const Mat *srcMat, *weights, *biasMat;
        const ActivationLayer* activ;
        Mat* dstMat;
        int nstripes;
        bool useAVX;
        bool useAVX2;
        bool useAVX512;
    };

#ifdef HAVE_OPENCL
    virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays) CV_OVERRIDE
    {
        innerProductOp.release();
        umat_blobs.clear();
        half_blobs.clear();
    }

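    // OpenCL path: runs the ocl4dnn inner product kernel (FP32 or FP16). If the
    // kernel fails for any input, the whole batch is recomputed with cv::gemm
    // against the transposed weights, with the bias added by a second gemm.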
    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, InputArrayOfArrays internals)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        bool use_half = (inps.depth() == CV_16S);
        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);

        int axisCan = clamp(axis, inputs[0].dims);
        int numOutput = blobs[0].size[0];
        int innerSize = blobs[0].size[1];
        int outerSize = total(shape(inputs[0]), 0, axisCan);
        bool ret = true;

        if (innerProductOp.empty())
        {
            size_t n = blobs.size();
            umat_blobs.resize(n);
            for (int i = 0; i < n; i++) blobs[i].copyTo(umat_blobs[i]);

            OCL4DNNInnerProductConfig config;
            config.num_output = numOutput;
            config.bias_term = bias;
            config.M = outerSize;
            config.K = innerSize;
            config.use_half = use_half;

            if (use_half)
            {
                half_blobs.resize(umat_blobs.size());
                for (int i = 0; i < umat_blobs.size(); i++)
                {
                    if (!umat_blobs[i].empty())
                        convertFp16(umat_blobs[i], half_blobs[i]);
                }
            }

            innerProductOp = Ptr<OCL4DNNInnerProduct<float> >(new OCL4DNNInnerProduct<float>(config));
        }

        for (size_t i = 0; i < inputs.size(); i++)
        {
            MatShape inshape, outshape;
            inshape = shape(outerSize, innerSize);
            outshape = shape(outerSize, numOutput);

            UMat srcMat, dstMat;
            srcMat = inputs[i].reshape(1, inshape.size(), &inshape[0]);
            dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]);

            if (!innerProductOp->Forward(srcMat, (use_half) ? half_blobs[0] : umat_blobs[0],
                                         (bias) ? (use_half ? half_blobs[1] : umat_blobs[1]) : UMat(),
                                         dstMat))
            {
                ret = false;
                break;
            }

            if (!use_half && bias && (outerSize > 1))
            {
                UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
                UMat& biases = umat_blobs[1];
                cv::gemm(biasOnesMat, biases, 1, dstMat, 1, dstMat, 0);
            }
        }

        if (ret) return true;

        UMat& weights = umat_blobs[0];
        for (size_t i = 0; i < inputs.size(); i++)
        {
            MatShape inshape, outshape;
            inshape = shape(outerSize, innerSize);
            outshape = shape(outerSize, numOutput);

            UMat srcMat, dstMat, srcMat_fp32, dstMat_fp32;
            srcMat = inputs[i].reshape(1, inshape.size(), &inshape[0]);
            dstMat = outputs[i].reshape(1, outshape.size(), &outshape[0]);

            if (use_half)
            {
                convertFp16(srcMat, srcMat_fp32);
                convertFp16(dstMat, dstMat_fp32);
            }
            else
            {
                srcMat_fp32 = srcMat;
                dstMat_fp32 = dstMat;
            }

            cv::gemm(srcMat_fp32, weights, 1, noArray(), 0, dstMat_fp32, GEMM_2_T);

            if (bias)
            {
                UMat biasOnesMat = UMat::ones(outerSize, 1, umat_blobs[0].type());
                UMat& biases = umat_blobs[1];
                cv::gemm(biasOnesMat, biases, 1, dstMat_fp32, 1, dstMat_fp32, 0);
            }
            if (use_half)
            {
                convertFp16(srcMat_fp32, srcMat);
                convertFp16(dstMat_fp32, dstMat);
            }
        }

        return true;
    }
#endif

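    // Default backend entry point: tries the OpenCL path on OpenCL targets,
    // falls back to the generic implementation for FP16 inputs, and otherwise
    // reshapes each input to outerSize x innerSize and runs the parallel CPU
    // kernel above.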
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        if (inputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> input, output;
        inputs_arr.getMatVector(input);
        outputs_arr.getMatVector(output);

        int axisCan = clamp(axis, input[0].dims);
        int outerSize = input[0].total(0, axisCan);

        for (size_t i = 0; i < input.size(); i++)
        {
            Mat srcMat = input[i].reshape(1, outerSize);
            Mat dstMat = output[i].reshape(1, outerSize);

            const int nstripes = getNumThreads();
            FullyConnected::run(srcMat, weightsMat, biasMat, dstMat, activ.get(), nstripes);
        }
    }

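    // CUDA backend: wraps this layer into a cuda4dnn InnerProductOp that
    // flattens the input from the clamped axis and reuses the CPU-side
    // weight and bias matrices.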
#ifdef HAVE_CUDA
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);

        auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();

        auto flatten_start_axis = clamp(axis, input_wrapper->getRank());

        auto biasMat_ = bias ? biasMat : Mat();
        return make_cuda_node<cuda4dnn::InnerProductOp>(preferableTarget, std::move(context->stream), std::move(context->cublas_handle), flatten_start_axis, weightsMat, biasMat_);
    }
#endif

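    // Halide backend: expresses the inner product as a reduction over the input
    // width, height and channels, plus an optional bias term.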
    virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
    {
#ifdef HAVE_HALIDE
        int inW, inH, inC, inN, outC = blobs[0].size[0];
        Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
        getCanonicalSize(inputBuffer, &inW, &inH, &inC, &inN);
        auto weights = wrapToHalideBuffer(blobs[0], {inW, inH, inC, outC});

        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        Halide::RDom r(0, inW, 0, inH, 0, inC);
        Halide::Expr topExpr = sum(inputBuffer(r.x, r.y, r.z, n) *
                                   weights(r.x, r.y, r.z, c));
        if (bias)
        {
            Halide::Buffer<float> bias = wrapToHalideBuffer(blobs[1], {outC});
            topExpr += bias(c);
        }
        top(x, y, c, n) = topExpr;
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }

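    // Inference Engine backend: builds a FullyConnected layer and attaches the
    // weights (as an OIHW blob) and the biases as constant data.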
#ifdef HAVE_INF_ENGINE
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
        InferenceEngine::Builder::FullyConnectedLayer ieLayer(name);

        const int outNum = blobs[0].size[0];
        ieLayer.setOutputNum(outNum);

        InferenceEngine::Builder::Layer l = ieLayer;
        addConstantData("weights", wrapToInfEngineBlob(blobs[0], {(size_t)blobs[0].size[0], (size_t)blobs[0].size[1], 1, 1}, InferenceEngine::Layout::OIHW), l);
        if (bias)
            addConstantData("biases", wrapToInfEngineBlob(blobs[1], {(size_t)outNum}, InferenceEngine::Layout::C), l);

        return Ptr<BackendNode>(new InfEngineBackendNode(l));
    }
#endif  // HAVE_INF_ENGINE

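    // FLOPS estimate: 3 * innerSize operations per output element.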
    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
    {
        CV_UNUSED(inputs); // suppress unused variable warning
        long flops = 0;

        int innerSize = blobs[0].size[1];
        for(int i = 0; i < outputs.size(); i++)
        {
            flops += CV_BIG_INT(3)*innerSize*total(outputs[i]);
        }

        return flops;

    }

    bool bias;
    Mat weightsMat, biasMat;
    Ptr<ActivationLayer> activ;
};

Ptr<InnerProductLayer> InnerProductLayer::create(const LayerParams& params)
{
    return Ptr<InnerProductLayer>(new FullyConnectedLayerImpl(params));
}
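
// A minimal usage sketch (illustrative only, not part of the original file),
// assuming a hypothetical 2x4 CV_32F weight blob and a 1x2 CV_32F bias blob:
//
//     LayerParams lp;
//     lp.set("num_output", 2);
//     lp.set("bias_term", true);
//     lp.blobs.push_back(Mat::ones(2, 4, CV_32F));    // weights: numOutput x innerSize
//     lp.blobs.push_back(Mat::zeros(1, 2, CV_32F));   // bias: 1 x numOutput
//     Ptr<InnerProductLayer> fc = InnerProductLayer::create(lp);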

}
}