51a2ea241123f27716e8bf619d1a9c1c16c36645
[platform/upstream/opencv.git] / modules / dnn / src / layers / pooling_layer.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Copyright (C) 2017, Intel Corporation, all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 //   * Redistribution's of source code must retain the above copyright notice,
21 //     this list of conditions and the following disclaimer.
22 //
23 //   * Redistribution's in binary form must reproduce the above copyright notice,
24 //     this list of conditions and the following disclaimer in the documentation
25 //     and/or other materials provided with the distribution.
26 //
27 //   * The name of the copyright holders may not be used to endorse or promote products
28 //     derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42
43 #include "../precomp.hpp"
44 #include "layers_common.hpp"
45 #include "opencv2/core/hal/intrin.hpp"
46 #include "op_halide.hpp"
47 #include "opencl_kernels_dnn.hpp"
48 #include <float.h>
49 #include <algorithm>
50 using std::max;
51 using std::min;
52 #ifdef HAVE_OPENCL
53 using namespace cv::dnn::ocl4dnn;
54 #endif
55
56 namespace cv
57 {
58 namespace dnn
59 {
/**
 * Scales an ROI coordinate and rounds the result to the nearest integer.
 *
 * Half a unit is added in the direction of the sign of @p f (not of the
 * product) before the value is truncated towards zero by the int conversion.
 *
 * @param f      ROI coordinate.
 * @param scale  Multiplier applied to @p f.
 * @return The scaled coordinate rounded half away from zero.
 */
static inline int scaleAndRoundRoi(float f, float scale)
{
    const float halfStep = (f >= 0.f) ? 0.5f : -0.5f;
    return (int)(f * scale + halfStep);
}
64
65 class PoolingLayerImpl : public PoolingLayer
66 {
67 public:
68     PoolingLayerImpl(const LayerParams& params)
69     {
70         type = MAX;
71         computeMaxIdx = true;
72         globalPooling = false;
73
74         if (params.has("pool"))
75         {
76             String pool = params.get<String>("pool").toLowerCase();
77             if (pool == "max")
78                 type = MAX;
79             else if (pool == "ave")
80                 type = AVE;
81             else if (pool == "stochastic")
82                 type = STOCHASTIC;
83             else
84                 CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\"");
85             getPoolingKernelParams(params, kernel.height, kernel.width, globalPooling,
86                                    pad.height, pad.width, stride.height, stride.width, padMode);
87         }
88         else if (params.has("pooled_w") || params.has("pooled_h") || params.has("spatial_scale"))
89         {
90             type = ROI;
91         }
92         setParamsFrom(params);
93         ceilMode = params.get<bool>("ceil_mode", true);
94         pooledSize.width = params.get<uint32_t>("pooled_w", 1);
95         pooledSize.height = params.get<uint32_t>("pooled_h", 1);
96         spatialScale = params.get<float>("spatial_scale", 1);
97     }
98
99 #ifdef HAVE_OPENCL
100     Ptr<OCL4DNNPool<float> > poolOp;
101 #endif
102
103     void finalize(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
104     {
105         CV_Assert(!inputs.empty());
106
107         cv::Size inp(inputs[0]->size[3], inputs[0]->size[2]),
108                 out(outputs[0].size[3], outputs[0].size[2]);
109
110         if(globalPooling)
111         {
112             kernel = inp;
113         }
114
115         getConvPoolPaddings(inp, out, kernel, stride, padMode, Size(1, 1), pad);
116     }
117
118     virtual bool supportBackend(int backendId)
119     {
120         return backendId == DNN_BACKEND_DEFAULT ||
121                backendId == DNN_BACKEND_HALIDE && haveHalide() &&
122                (type == MAX || type == AVE && !pad.width && !pad.height);
123     }
124
125 #ifdef HAVE_OPENCL
126     bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, InputArrayOfArrays internals)
127     {
128         std::vector<UMat> inputs;
129         std::vector<UMat> outputs;
130
131         inps.getUMatVector(inputs);
132         outs.getUMatVector(outputs);
133
134         if (poolOp.empty())
135         {
136             OCL4DNNPoolConfig config;
137
138             config.in_shape = shape(inputs[0]);
139             config.out_shape = shape(outputs[0]);
140             config.kernel = kernel;
141             config.pad = pad;
142             config.stride = stride;
143             config.channels = inputs[0].size[1];
144             config.pool_method = type == MAX ? LIBDNN_POOLING_METHOD_MAX :
145                                 (type == AVE ? LIBDNN_POOLING_METHOD_AVE :
146                                                LIBDNN_POOLING_METHOD_STO);
147             poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config));
148         }
149
150         for (size_t ii = 0; ii < inputs.size(); ii++)
151         {
152             UMat& inpMat = inputs[ii];
153             int out_index = (type == MAX) ? 2 : 1;
154             UMat& outMat = outputs[out_index * ii];
155             UMat maskMat = (type == MAX) ? outputs[2 * ii + 1] : UMat();
156
157             CV_Assert(inpMat.offset == 0 && outMat.offset == 0);
158
159             if (!poolOp->Forward(inpMat, outMat, maskMat))
160                 return false;
161         }
162
163         return true;
164     }
165 #endif
166
    /**
     * Generic forward entry point.
     *
     * Offers the work to the OpenCL implementation first (only when the
     * preferable target is OpenCL and the performance check on the default
     * device passes), then hands over to Layer::forward_fallback().
     */
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        // NOTE(review): CV_OCL_RUN is presumably an early-return macro that
        // leaves this function when forward_ocl() succeeds and falls through
        // otherwise - confirm against its definition. No trailing semicolon
        // is used on purpose.
        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
    }
178
179     void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
180     {
181         CV_TRACE_FUNCTION();
182         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
183
184         switch (type)
185         {
186             case MAX:
187                 CV_Assert(inputs.size() == 1, outputs.size() == 2);
188                 maxPooling(*inputs[0], outputs[0], outputs[1]);
189                 break;
190             case AVE:
191                 CV_Assert(inputs.size() == 1, outputs.size() == 1);
192                 avePooling(*inputs[0], outputs[0]);
193                 break;
194             case ROI:
195                 CV_Assert(inputs.size() == 2, outputs.size() == 1);
196                 roiPooling(*inputs[0], *inputs[1], outputs[0]);
197                 break;
198             default:
199                 CV_Error(Error::StsNotImplemented, "Not implemented");
200                 break;
201         }
202     }
203
204     virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
205     {
206         if (type == MAX)
207             return initMaxPoolingHalide(inputs);
208         else if (type == AVE)
209             return initAvePoolingHalide(inputs);
210         else
211             return Ptr<BackendNode>();
212     }
213
214     class PoolingInvoker : public ParallelLoopBody
215     {
216     public:
217         const Mat* src, *rois;
218         Mat *dst, *mask;
219         Size kernel, stride, pad;
220         int nstripes;
221         bool computeMaxIdx;
222         std::vector<int> ofsbuf;
223         int poolingType;
224         float spatialScale;
225
226         PoolingInvoker() : src(0), rois(0), dst(0), mask(0), nstripes(0),
227                            computeMaxIdx(0), poolingType(MAX), spatialScale(0) {}
228
229         static void run(const Mat& src, const Mat& rois, Mat& dst, Mat& mask, Size kernel,
230                         Size stride, Size pad, int poolingType, float spatialScale,
231                         bool computeMaxIdx, int nstripes)
232         {
233             CV_Assert(src.isContinuous() && dst.isContinuous() &&
234                       src.type() == CV_32F && src.type() == dst.type() &&
235                       src.dims == 4 && dst.dims == 4 &&
236                       (poolingType == ROI && dst.size[0] == rois.size[0] ||
237                        src.size[0] == dst.size[0]) && src.size[1] == dst.size[1] &&
238                       (mask.empty() || (mask.type() == src.type() && mask.size == dst.size)));
239
240             PoolingInvoker p;
241
242             p.src = &src;
243             p.rois = &rois;
244             p.dst = &dst;
245             p.mask = &mask;
246             p.kernel = kernel;
247             p.stride = stride;
248             p.pad = pad;
249             p.nstripes = nstripes;
250             p.computeMaxIdx = computeMaxIdx;
251             p.poolingType = poolingType;
252             p.spatialScale = spatialScale;
253
254             if( !computeMaxIdx )
255             {
256                 p.ofsbuf.resize(kernel.width*kernel.height);
257                 for( int i = 0; i < kernel.height; i++ )
258                     for( int j = 0; j < kernel.width; j++ )
259                         p.ofsbuf[i*kernel.width + j] = src.size[3]*i + j;
260             }
261
262             parallel_for_(Range(0, nstripes), p, nstripes);
263         }
264
265         void operator()(const Range& r) const
266         {
267             int channels = dst->size[1], width = dst->size[3], height = dst->size[2];
268             int inp_width = src->size[3], inp_height = src->size[2];
269             size_t total = dst->total();
270             size_t stripeSize = (total + nstripes - 1)/nstripes;
271             size_t stripeStart = r.start*stripeSize;
272             size_t stripeEnd = std::min(r.end*stripeSize, total);
273             int kernel_w = kernel.width, kernel_h = kernel.height;
274             int pad_w = pad.width, pad_h = pad.height;
275             int stride_w = stride.width, stride_h = stride.height;
276             bool compMaxIdx = computeMaxIdx;
277
278 #if CV_SIMD128
279             const int* ofsptr = &ofsbuf[0];
280             v_float32x4 idx00(0.f, (float)stride_w, (float)(stride_w*2), (float)(stride_w*3));
281             v_float32x4 ones = v_setall_f32(1.f);
282             v_float32x4 idx_delta = v_setall_f32((float)(inp_width - kernel_w));
283 #endif
284
285             for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
286             {
287                 size_t ofs = ofs0;
288                 int x0 = (int)(ofs % width);
289                 ofs /= width;
290                 int y0 = (int)(ofs % height);
291                 ofs /= height;
292                 int c = (int)(ofs % channels);
293                 int n = (int)(ofs / channels);
294                 int ystart, yend;
295
296                 const float *srcData;
297                 int xstartROI = 0;
298                 float roiRatio = 0;
299                 if (poolingType == ROI)
300                 {
301                     const float *roisData = rois->ptr<float>(n);
302                     int ystartROI = scaleAndRoundRoi(roisData[2], spatialScale);
303                     int yendROI = scaleAndRoundRoi(roisData[4], spatialScale);
304                     int roiHeight = std::max(yendROI - ystartROI + 1, 1);
305                     roiRatio = (float)roiHeight / height;
306
307                     ystart = ystartROI + y0 * roiRatio;
308                     yend = ystartROI + std::ceil((y0 + 1) * roiRatio);
309
310                     xstartROI = scaleAndRoundRoi(roisData[1], spatialScale);
311                     int xendROI = scaleAndRoundRoi(roisData[3], spatialScale);
312                     int roiWidth = std::max(xendROI - xstartROI + 1, 1);
313                     roiRatio = (float)roiWidth / width;
314
315                     CV_Assert(roisData[0] < src->size[0]);
316                     srcData = src->ptr<float>(roisData[0], c);
317                 }
318                 else
319                 {
320                     ystart = y0 * stride_h - pad_h;
321                     yend = min(ystart + kernel_h, inp_height + pad_h);
322                     srcData = src->ptr<float>(n, c);
323                 }
324                 int ydelta = yend - ystart;
325                 ystart = max(ystart, 0);
326                 yend = min(yend, inp_height);
327                 float *dstData = dst->ptr<float>(n, c, y0);
328                 float *dstMaskData = mask->data ? mask->ptr<float>(n, c, y0) : 0;
329
330                 int delta = std::min((int)(stripeEnd - ofs0), width - x0);
331                 ofs0 += delta;
332                 int x1 = x0 + delta;
333
334                 if( poolingType == MAX || poolingType == ROI)
335                     for( ; x0 < x1; x0++ )
336                     {
337                         int xstart, xend;
338                         if (poolingType == ROI)
339                         {
340                             xstart = xstartROI + x0 * roiRatio;
341                             xend = xstartROI + std::ceil((x0 + 1) * roiRatio);
342                         }
343                         else
344                         {
345                             xstart = x0 * stride_w - pad_w;
346                             xend = xstart + kernel_w;
347                         }
348                         xstart = max(xstart, 0);
349                         xend = min(xend, inp_width);
350                         if (xstart >= xend || ystart >= yend)
351                         {
352                             dstData[x0] = 0;
353                             if (compMaxIdx && dstMaskData)
354                                 dstMaskData[x0] = -1;
355                             continue;
356                         }
357 #if CV_SIMD128
358                         if( xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
359                         {
360                             if( compMaxIdx )
361                             {
362                                 v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
363                                 v_float32x4 max_val1 = max_val0;
364                                 v_float32x4 max_idx0 = v_setall_f32(-1.f);
365                                 v_float32x4 max_idx1 = max_idx0;
366                                 int index0 = ystart * inp_width + xstart;
367                                 v_float32x4 idx0 = idx00 + v_setall_f32((float)index0);
368                                 v_float32x4 idx1 = idx0 + v_setall_f32((float)(stride_w*4));
369
370                                 for (int y = ystart; y < yend; ++y)
371                                 {
372                                     for (int x = xstart; x < xend; ++x, idx0 += ones, idx1 += ones)
373                                     {
374                                         const int index = y * inp_width + x;
375                                         v_float32x4 v0(srcData[index], srcData[index + stride_w],
376                                                        srcData[index + stride_w*2], srcData[index + stride_w*3]);
377                                         v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
378                                                        srcData[index + stride_w*6], srcData[index + stride_w*7]);
379                                         max_idx0 = v_select(v0 > max_val0, idx0, max_idx0);
380                                         max_idx1 = v_select(v1 > max_val1, idx1, max_idx1);
381                                         max_val0 = v_max(max_val0, v0);
382                                         max_val1 = v_max(max_val1, v1);
383                                     }
384                                     idx0 += idx_delta;
385                                     idx1 += idx_delta;
386                                 }
387                                 v_store(dstData + x0, max_val0);
388                                 v_store(dstData + x0 + 4, max_val1);
389                                 if (dstMaskData)
390                                 {
391                                     v_store(dstMaskData + x0, max_idx0);
392                                     v_store(dstMaskData + x0 + 4, max_idx1);
393                                 }
394                                 x0 += 7;
395                             }
396                             else
397                             {
398                                 v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
399                                 v_float32x4 max_val1 = max_val0;
400
401                                 if( yend - ystart == kernel_h )
402                                 {
403                                     const float* srcData1 = srcData + ystart*inp_width + xstart;
404                                     if( stride_w == 1 )
405                                         for (int k = 0; k < kernel_w*kernel_h; k++)
406                                         {
407                                             int index = ofsptr[k];
408                                             v_float32x4 v0 = v_load(srcData1 + index);
409                                             v_float32x4 v1 = v_load(srcData1 + index + 4);
410                                             max_val0 = v_max(max_val0, v0);
411                                             max_val1 = v_max(max_val1, v1);
412                                         }
413 #if CV_SSE2
414                                     else if( stride_w == 2 )
415                                         for (int k = 0; k < kernel_w*kernel_h; k++)
416                                         {
417                                             int index = ofsptr[k];
418                                             v_float32x4 v00 = v_load(srcData1 + index), v01 = v_load(srcData1 + index + 4);
419                                             v_float32x4 v0(_mm_shuffle_ps(v00.val, v01.val, _MM_SHUFFLE(2, 0, 2, 0)));
420                                             v_float32x4 v10 = v_load(srcData1 + index + 8), v11 = v_load(srcData1 + index + 12);
421                                             v_float32x4 v1(_mm_shuffle_ps(v10.val, v11.val, _MM_SHUFFLE(2, 0, 2, 0)));
422                                             max_val0 = v_max(max_val0, v0);
423                                             max_val1 = v_max(max_val1, v1);
424                                         }
425 #endif
426                                     else
427                                         for (int k = 0; k < kernel_w*kernel_h; k++)
428                                         {
429                                             int index = ofsptr[k];
430                                             v_float32x4 v0(srcData1[index], srcData1[index + stride_w],
431                                                            srcData1[index + stride_w*2], srcData1[index + stride_w*3]);
432                                             v_float32x4 v1(srcData1[index + stride_w*4], srcData1[index + stride_w*5],
433                                                            srcData1[index + stride_w*6], srcData1[index + stride_w*7]);
434                                             max_val0 = v_max(max_val0, v0);
435                                             max_val1 = v_max(max_val1, v1);
436                                         }
437                                 }
438                                 else
439                                 {
440                                     for (int y = ystart; y < yend; ++y)
441                                     {
442                                         for (int x = xstart; x < xend; ++x)
443                                         {
444                                             const int index = y * inp_width + x;
445                                             v_float32x4 v0(srcData[index], srcData[index + stride_w],
446                                                            srcData[index + stride_w*2], srcData[index + stride_w*3]);
447                                             v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
448                                                            srcData[index + stride_w*6], srcData[index + stride_w*7]);
449                                             max_val0 = v_max(max_val0, v0);
450                                             max_val1 = v_max(max_val1, v1);
451                                         }
452                                     }
453                                 }
454                                 v_store(dstData + x0, max_val0);
455                                 v_store(dstData + x0 + 4, max_val1);
456                                 x0 += 7;
457                             }
458                         }
459                         else
460 #endif
461                         {
462                             float max_val = -FLT_MAX;
463                             if( compMaxIdx )
464                             {
465                                 int max_index = -1;
466                                 for (int y = ystart; y < yend; ++y)
467                                     for (int x = xstart; x < xend; ++x)
468                                     {
469                                         const int index = y * inp_width + x;
470                                         float val = srcData[index];
471                                         if (val > max_val)
472                                         {
473                                             max_val = val;
474                                             max_index = index;
475                                         }
476                                     }
477
478                                 dstData[x0] = max_val;
479                                 if (dstMaskData)
480                                     dstMaskData[x0] = max_index;
481                             }
482                             else
483                             {
484                                 for (int y = ystart; y < yend; ++y)
485                                     for (int x = xstart; x < xend; ++x)
486                                     {
487                                         const int index = y * inp_width + x;
488                                         float val = srcData[index];
489                                         max_val = std::max(max_val, val);
490                                     }
491
492                                 dstData[x0] = max_val;
493                             }
494                         }
495                     }
496                 else
497                 {
498                     for( ; x0 < x1; x0++ )
499                     {
500                         int xstart = x0 * stride_w - pad_w;
501                         int xend = min(xstart + kernel_w, inp_width + pad_w);
502                         int xdelta = xend - xstart;
503                         xstart = max(xstart, 0);
504                         xend = min(xend, inp_width);
505                         float inv_kernel_area = 1.f/(ydelta*xdelta);
506
507 #if CV_SIMD128
508                         if( xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
509                         {
510                             v_float32x4 sum_val0 = v_setzero_f32(), sum_val1 = v_setzero_f32();
511                             v_float32x4 ikarea = v_setall_f32(inv_kernel_area);
512
513                             for (int y = ystart; y < yend; ++y)
514                             {
515                                 for (int x = xstart; x < xend; ++x)
516                                 {
517                                     const int index = y * inp_width + x;
518                                     v_float32x4 v0(srcData[index], srcData[index + stride_w],
519                                                    srcData[index + stride_w*2], srcData[index + stride_w*3]);
520                                     v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
521                                                    srcData[index + stride_w*6], srcData[index + stride_w*7]);
522                                     sum_val0 += v0;
523                                     sum_val1 += v1;
524                                 }
525                             }
526                             v_store(dstData + x0, sum_val0*ikarea);
527                             v_store(dstData + x0 + 4, sum_val1*ikarea);
528                             x0 += 7;
529                         }
530                         else
531 #endif
532                         {
533                             float sum_val = 0.f;
534                             for (int y = ystart; y < yend; ++y)
535                                 for (int x = xstart; x < xend; ++x)
536                                 {
537                                     const int index = y * inp_width + x;
538                                     float val = srcData[index];
539                                     sum_val += val;
540                                 }
541
542                             dstData[x0] = sum_val*inv_kernel_area;
543                         }
544                     }
545                 }
546             }
547         }
548     };
549
550     void maxPooling(Mat &src, Mat &dst, Mat &mask)
551     {
552         const int nstripes = getNumThreads();
553         Mat rois;
554         PoolingInvoker::run(src, rois, dst, mask, kernel, stride, pad, type, spatialScale, computeMaxIdx, nstripes);
555     }
556
557     void avePooling(Mat &src, Mat &dst)
558     {
559         const int nstripes = getNumThreads();
560         Mat rois, mask;
561         PoolingInvoker::run(src, rois, dst, mask, kernel, stride, pad, type, spatialScale, computeMaxIdx, nstripes);
562     }
563
564     void roiPooling(const Mat &src, const Mat &rois, Mat &dst)
565     {
566         const int nstripes = getNumThreads();
567         Mat mask;
568         PoolingInvoker::run(src, rois, dst, mask, kernel, stride, pad, type, spatialScale, computeMaxIdx, nstripes);
569     }
570
571     virtual Ptr<BackendNode> initMaxPoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
572     {
573 #ifdef HAVE_HALIDE
574         Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
575         const int inWidth = inputBuffer.width();
576         const int inHeight = inputBuffer.height();
577
578         Halide::Var x("x"), y("y"), c("c"), n("n");
579         Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
580         Halide::RDom r(0, kernel.width, 0, kernel.height);
581         Halide::Expr kx, ky;
582         if (pad.width || pad.height)
583         {
584             kx = clamp(x * stride.width + r.x - pad.width, 0, inWidth - 1);
585             ky = clamp(y * stride.height + r.y - pad.height, 0, inHeight - 1);
586         }
587         else
588         {
589             kx = min(x * stride.width + r.x, inWidth - 1);
590             ky = min(y * stride.height + r.y, inHeight - 1);
591         }
592
593         // Halide::argmax returns tuple (r.x, r.y, max).
594         Halide::Tuple res = argmax(inputBuffer(kx, ky, c, n));
595
596         // Compute offset from argmax in range [0, kernel_size).
597         Halide::Expr max_index;
598         if (pad.width || pad.height)
599         {
600             max_index = clamp(y * stride.height + res[1] - pad.height,
601                               0, inHeight - 1) * inWidth +
602                         clamp(x * stride.width + res[0] - pad.width,
603                               0, inWidth - 1);
604         }
605         else
606         {
607             max_index = min(y * stride.height + res[1], inHeight - 1) * inWidth +
608                         min(x * stride.width + res[0], inWidth - 1);
609         }
610         top(x, y, c, n) = { res[2], Halide::cast<float>(max_index) };
611         return Ptr<BackendNode>(new HalideBackendNode(top));
612 #endif  // HAVE_HALIDE
613         return Ptr<BackendNode>();
614     }
615
616     virtual Ptr<BackendNode> initAvePoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
617     {
618 #ifdef HAVE_HALIDE
619         Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
620
621         const int inW = inputBuffer.width(), inH = inputBuffer.height();
622         if ((inW - kernel.width) % stride.width || (inH - kernel.height) % stride.height)
623         {
624             CV_Error(cv::Error::StsNotImplemented,
625                      "Halide backend for average pooling with partial "
626                      "kernels is not implemented");
627         }
628
629         const float norm = 1.0f / (kernel.width * kernel.height);
630
631         Halide::Var x("x"), y("y"), c("c"), n("n");
632         Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
633         Halide::RDom r(0, kernel.width, 0, kernel.height);
634         top(x, y, c, n) = sum(
635             inputBuffer(x * stride.width + r.x,
636                         y * stride.height + r.y, c, n)) * norm;
637         return Ptr<BackendNode>(new HalideBackendNode(top));
638 #endif  // HAVE_HALIDE
639         return Ptr<BackendNode>();
640     }
641
    /**
     * Applies a hand-written Halide schedule to the last function of @p node
     * when targeting the CPU; every other target is delegated to
     * Layer::applyHalideScheduler().
     *
     * The schedule is chosen from the size of the first output: spatially
     * small outputs (width or height below 8) versus larger ones, and within
     * each case whether there are more than 8 channels.
     *
     * Does nothing when built without Halide.
     */
    virtual void applyHalideScheduler(Ptr<BackendNode>& node,
                                      const std::vector<Mat*> &inputs,
                                      const std::vector<Mat> &outputs,
                                      int targetId) const
    {
#ifdef  HAVE_HALIDE
        if (targetId != DNN_TARGET_CPU)
        {
            Layer::applyHalideScheduler(node, inputs, outputs, targetId);
            return;
        }
        Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"),
                    xi("xi"), yi("yi"), ci("ci"), xo("xo"), yo("yo"), co("co");
        Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();

        int outW, outH, outC, outN;
        getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);

        if (outW < 8 || outH < 8)
        {
            // Small spatial extent.
            if (outC > 8)
                // Split channels by 8, fuse everything else into one
                // parallel loop and vectorize the inner channel chunk.
                top.split(c, co, ci, 8)
                   .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
                   .parallel(tile)
                   .vectorize(ci);
            else
            {
                // Few channels: parallelize over fused (y, c, n) and
                // vectorize along x when there is more than one column.
                top.fuse(y, c, tile).fuse(n, tile, tile)
                   .parallel(tile);
                if (outW > 1)
                    top.vectorize(x);
            }
        }
        else
        {
            // Larger spatial extent: split x and y by 8 and vectorize the
            // inner x chunk; channels are additionally split by 8 when there
            // are more than 8 of them.
            if (outC > 8)
                top.split(x, xo, xi, 8).split(y, yo, yi, 8).split(c, co, ci, 8)
                   .fuse(xo, yo, tile).fuse(co, tile, tile).fuse(n, tile, tile)
                   .parallel(tile)
                   .vectorize(xi);
            else
                top.split(x, xo, xi, 8).split(y, yo, yi, 8)
                   .fuse(xo, yo, tile).fuse(c, tile, tile).fuse(n, tile, tile)
                   .parallel(tile)
                   .vectorize(xi);
        }
#endif  // HAVE_HALIDE
    }
690
691     bool getMemoryShapes(const std::vector<MatShape> &inputs,
692                          const int requiredOutputs,
693                          std::vector<MatShape> &outputs,
694                          std::vector<MatShape> &internals) const
695     {
696         CV_Assert(inputs.size() != 0);
697         Size in(inputs[0][3], inputs[0][2]), out;
698
699         if (globalPooling)
700         {
701             out.height = 1;
702             out.width = 1;
703         }
704         else if (type == ROI)
705         {
706             out.height = pooledSize.height;
707             out.width = pooledSize.width;
708         }
709         else if (padMode.empty())
710         {
711             float height = (float)(in.height + 2 * pad.height - kernel.height) / stride.height;
712             float width = (float)(in.width + 2 * pad.width - kernel.width) / stride.width;
713             out.height = 1 + (ceilMode ? ceil(height) : floor(height));
714             out.width = 1 + (ceilMode ? ceil(width) : floor(width));
715
716             if (pad.height || pad.width)
717             {
718                 // If we have padding, ensure that the last pooling starts strictly
719                 // inside the image (instead of at the padding); otherwise clip the last.
720                 if ((out.height - 1) * stride.height >= in.height + pad.height)
721                     --out.height;
722                 if ((out.width - 1) * stride.width >= in.width + pad.width)
723                     --out.width;
724                 CV_Assert((out.height - 1) * stride.height < in.height + pad.height);
725                 CV_Assert((out.width - 1) * stride.width < in.width + pad.width);
726             }
727         }
728         else
729         {
730             getConvPoolOutParams(in, kernel, stride, padMode, Size(1, 1), out);
731         }
732
733         int dims[] = {inputs[0][0], inputs[0][1], out.height, out.width};
734         if (type == ROI)
735         {
736             CV_Assert(inputs.size() == 2);
737             dims[0] = inputs[1][0];  // Number of proposals;
738         }
739         outputs.assign(type == MAX ? 2 : 1, shape(dims));
740         return false;
741     }
742
743     virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
744                            const std::vector<MatShape> &outputs) const
745     {
746         (void)inputs; // suppress unused variable warning
747         long flops = 0;
748
749         for(int i = 0; i < outputs.size(); i++)
750         {
751             if (type == MAX)
752             {
753                 if (i%2 == 0)
754                     flops += total(outputs[i])*kernel.area();
755             }
756             else
757             {
758                 flops += total(outputs[i])*(kernel.area() + 1);
759             }
760         }
761         return flops;
762     }
763 private:
    // Pooling kinds understood by this layer; stored in `type`.
    enum Type
    {
        MAX,         // maximum over the window; default, also selected by pool="max"
        AVE,         // average over the window; selected by pool="ave"
        STOCHASTIC,  // selected by pool="stochastic"; the CPU forward() reports it as not implemented
        ROI          // ROI pooling; selected when pooled_w / pooled_h / spatial_scale is given without "pool"
    };
771 };
772
773 Ptr<PoolingLayer> PoolingLayer::create(const LayerParams& params)
774 {
775     return Ptr<PoolingLayer>(new PoolingLayerImpl(params));
776 }
777
778 }
779 }