/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "../precomp.hpp"
#include "layers_common.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "op_halide.hpp"
#include "opencl_kernels_dnn.hpp"
#include <float.h>
#include <algorithm>
using std::max;
using std::min;
#ifdef HAVE_OPENCL
using namespace cv::dnn::ocl4dnn;
#endif

namespace cv
{
namespace dnn
{
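// Scales an ROI coordinate by `scale` (image space -> feature-map space) and
// rounds it to the nearest integer, half away from zero, as Caffe's ROIPooling
// does with round().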
static inline int scaleAndRoundRoi(float f, float scale)
{
    return (int)(f * scale + (f >= 0.f ? 0.5f : -0.5f));
}

class PoolingLayerImpl : public PoolingLayer
{
public:
    PoolingLayerImpl(const LayerParams& params)
    {
        type = MAX;
        computeMaxIdx = true;
        globalPooling = false;

        if (params.has("pool"))
        {
            String pool = params.get<String>("pool").toLowerCase();
            if (pool == "max")
                type = MAX;
            else if (pool == "ave")
                type = AVE;
            else if (pool == "stochastic")
                type = STOCHASTIC;
            else
                CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\"");
            getPoolingKernelParams(params, kernel.height, kernel.width, globalPooling,
                                   pad.height, pad.width, stride.height, stride.width, padMode);
        }
        else if (params.has("pooled_w") || params.has("pooled_h") || params.has("spatial_scale"))
        {
            type = ROI;
            computeMaxIdx = false;
        }
        setParamsFrom(params);
        ceilMode = params.get<bool>("ceil_mode", true);
        pooledSize.width = params.get<uint32_t>("pooled_w", 1);
        pooledSize.height = params.get<uint32_t>("pooled_h", 1);
        spatialScale = params.get<float>("spatial_scale", 1);
    }

#ifdef HAVE_OPENCL
    Ptr<OCL4DNNPool<float> > poolOp;
#endif

    void finalize(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
    {
        CV_Assert(!inputs.empty());

        cv::Size inp(inputs[0]->size[3], inputs[0]->size[2]),
                out(outputs[0].size[3], outputs[0].size[2]);

        if(globalPooling)
        {
            kernel = inp;
        }

        getConvPoolPaddings(inp, out, kernel, stride, padMode, Size(1, 1), pad);
    }

    virtual bool supportBackend(int backendId)
    {
        return backendId == DNN_BACKEND_DEFAULT ||
               (backendId == DNN_BACKEND_HALIDE && haveHalide() &&
                (type == MAX || (type == AVE && !pad.width && !pad.height)));
    }

#ifdef HAVE_OPENCL
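    // OpenCL path: the OCL4DNNPool primitive is created lazily from the layer
    // configuration on the first call and reused afterwards. For MAX pooling
    // every input has two associated outputs (pooled values and argmax mask).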
    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, InputArrayOfArrays internals)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);

        if (poolOp.empty())
        {
            OCL4DNNPoolConfig config;

            config.in_shape = shape(inputs[0]);
            config.out_shape = shape(outputs[0]);
            config.kernel = kernel;
            config.pad = pad;
            config.stride = stride;
            config.channels = inputs[0].size[1];
            config.pool_method = type == MAX ? LIBDNN_POOLING_METHOD_MAX :
                                (type == AVE ? LIBDNN_POOLING_METHOD_AVE :
                                               LIBDNN_POOLING_METHOD_STO);
            poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config));
        }

        for (size_t ii = 0; ii < inputs.size(); ii++)
        {
            UMat& inpMat = inputs[ii];
            int out_index = (type == MAX) ? 2 : 1;
            UMat& outMat = outputs[out_index * ii];
            UMat maskMat = (type == MAX) ? outputs[2 * ii + 1] : UMat();

            CV_Assert(inpMat.offset == 0 && outMat.offset == 0);

            if (!poolOp->Forward(inpMat, outMat, maskMat))
                return false;
        }

        return true;
    }
#endif

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
    }

    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        switch (type)
        {
            case MAX:
                CV_Assert(inputs.size() == 1, outputs.size() == 2);
                maxPooling(*inputs[0], outputs[0], outputs[1]);
                break;
            case AVE:
                CV_Assert(inputs.size() == 1, outputs.size() == 1);
                avePooling(*inputs[0], outputs[0]);
                break;
            case ROI:
                CV_Assert(inputs.size() == 2, outputs.size() == 1);
                roiPooling(*inputs[0], *inputs[1], outputs[0]);
                break;
            default:
                CV_Error(Error::StsNotImplemented, "Not implemented");
                break;
        }
    }

    virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
    {
        if (type == MAX)
            return initMaxPoolingHalide(inputs);
        else if (type == AVE)
            return initAvePoolingHalide(inputs);
        else
            return Ptr<BackendNode>();
    }

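    // CPU implementation of MAX/AVE/ROI pooling. The flattened output tensor is
    // split into `nstripes` contiguous ranges that are processed concurrently via
    // parallel_for_; the inner loops are vectorized with universal intrinsics
    // where CV_SIMD128 is available.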
    class PoolingInvoker : public ParallelLoopBody
    {
    public:
        const Mat* src, *rois;
        Mat *dst, *mask;
        Size kernel, stride, pad;
        int nstripes;
        bool computeMaxIdx;
        std::vector<int> ofsbuf;
        int poolingType;
        float spatialScale;

        PoolingInvoker() : src(0), rois(0), dst(0), mask(0), nstripes(0),
                           computeMaxIdx(0), poolingType(MAX), spatialScale(0) {}

        static void run(const Mat& src, const Mat& rois, Mat& dst, Mat& mask, Size kernel,
                        Size stride, Size pad, int poolingType, float spatialScale,
                        bool computeMaxIdx, int nstripes)
        {
            CV_Assert(src.isContinuous() && dst.isContinuous() &&
                      src.type() == CV_32F && src.type() == dst.type() &&
                      src.dims == 4 && dst.dims == 4 &&
                      ((poolingType == ROI && dst.size[0] == rois.size[0]) ||
                       src.size[0] == dst.size[0]) && src.size[1] == dst.size[1] &&
                      (mask.empty() || (mask.type() == src.type() && mask.size == dst.size)));

            PoolingInvoker p;

            p.src = &src;
            p.rois = &rois;
            p.dst = &dst;
            p.mask = &mask;
            p.kernel = kernel;
            p.stride = stride;
            p.pad = pad;
            p.nstripes = nstripes;
            p.computeMaxIdx = computeMaxIdx;
            p.poolingType = poolingType;
            p.spatialScale = spatialScale;

            if( !computeMaxIdx )
            {
                p.ofsbuf.resize(kernel.width*kernel.height);
                for( int i = 0; i < kernel.height; i++ )
                    for( int j = 0; j < kernel.width; j++ )
                        p.ofsbuf[i*kernel.width + j] = src.size[3]*i + j;
            }

            parallel_for_(Range(0, nstripes), p, nstripes);
        }

        void operator()(const Range& r) const
        {
            int channels = dst->size[1], width = dst->size[3], height = dst->size[2];
            int inp_width = src->size[3], inp_height = src->size[2];
            size_t total = dst->total();
            size_t stripeSize = (total + nstripes - 1)/nstripes;
            size_t stripeStart = r.start*stripeSize;
            size_t stripeEnd = std::min(r.end*stripeSize, total);
            int kernel_w = kernel.width, kernel_h = kernel.height;
            int pad_w = pad.width, pad_h = pad.height;
            int stride_w = stride.width, stride_h = stride.height;
            bool compMaxIdx = computeMaxIdx;

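            // Per-lane index vectors used by the vectorized MAX branch: idx00 holds
            // the starting column offsets of four neighbouring pooling windows,
            // `ones` advances them by one input column, and idx_delta skips to the
            // next input row. ofsptr caches the precomputed kernel offsets used by
            // the no-mask fast path.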
#if CV_SIMD128
            const int* ofsptr = &ofsbuf[0];
            v_float32x4 idx00(0.f, (float)stride_w, (float)(stride_w*2), (float)(stride_w*3));
            v_float32x4 ones = v_setall_f32(1.f);
            v_float32x4 idx_delta = v_setall_f32((float)(inp_width - kernel_w));
#endif

            for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
            {
                size_t ofs = ofs0;
                int x0 = (int)(ofs % width);
                ofs /= width;
                int y0 = (int)(ofs % height);
                ofs /= height;
                int c = (int)(ofs % channels);
                int n = (int)(ofs / channels);
                int ystart, yend;

                const float *srcData;
                if (poolingType == ROI)
                {
                    const float *roisData = rois->ptr<float>(n);
                    int ystartROI = scaleAndRoundRoi(roisData[2], spatialScale);
                    int yendROI = scaleAndRoundRoi(roisData[4], spatialScale);
                    int roiHeight = std::max(yendROI - ystartROI + 1, 1);
                    float roiRatio = (float)roiHeight / height;

                    ystart = ystartROI + y0 * roiRatio;
                    yend = ystartROI + std::ceil((y0 + 1) * roiRatio);

                    CV_Assert(roisData[0] < src->size[0]);
                    srcData = src->ptr<float>(roisData[0], c);
                }
                else
                {
                    ystart = y0 * stride_h - pad_h;
                    yend = min(ystart + kernel_h, inp_height + pad_h);
                    srcData = src->ptr<float>(n, c);
                }
                int ydelta = yend - ystart;
                ystart = max(ystart, 0);
                yend = min(yend, inp_height);
                float *dstData = dst->ptr<float>(n, c, y0);
                float *dstMaskData = mask->data ? mask->ptr<float>(n, c, y0) : 0;

                int delta = std::min((int)(stripeEnd - ofs0), width - x0);
                ofs0 += delta;
                int x1 = x0 + delta;

                if( poolingType == MAX)
                    for( ; x0 < x1; x0++ )
                    {
                        int xstart = x0 * stride_w - pad_w;
                        int xend = min(xstart + kernel_w, inp_width);
                        xstart = max(xstart, 0);
                        if (xstart >= xend || ystart >= yend)
                        {
                            dstData[x0] = 0;
                            if (compMaxIdx && dstMaskData)
                                dstMaskData[x0] = -1;
                            continue;
                        }
#if CV_SIMD128
                        if( xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
                        {
                            if( compMaxIdx )
                            {
                                v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
                                v_float32x4 max_val1 = max_val0;
                                v_float32x4 max_idx0 = v_setall_f32(-1.f);
                                v_float32x4 max_idx1 = max_idx0;
                                int index0 = ystart * inp_width + xstart;
                                v_float32x4 idx0 = idx00 + v_setall_f32((float)index0);
                                v_float32x4 idx1 = idx0 + v_setall_f32((float)(stride_w*4));

                                for (int y = ystart; y < yend; ++y)
                                {
                                    for (int x = xstart; x < xend; ++x, idx0 += ones, idx1 += ones)
                                    {
                                        const int index = y * inp_width + x;
                                        v_float32x4 v0(srcData[index], srcData[index + stride_w],
                                                       srcData[index + stride_w*2], srcData[index + stride_w*3]);
                                        v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
                                                       srcData[index + stride_w*6], srcData[index + stride_w*7]);
                                        max_idx0 = v_select(v0 > max_val0, idx0, max_idx0);
                                        max_idx1 = v_select(v1 > max_val1, idx1, max_idx1);
                                        max_val0 = v_max(max_val0, v0);
                                        max_val1 = v_max(max_val1, v1);
                                    }
                                    idx0 += idx_delta;
                                    idx1 += idx_delta;
                                }
                                v_store(dstData + x0, max_val0);
                                v_store(dstData + x0 + 4, max_val1);
                                if (dstMaskData)
                                {
                                    v_store(dstMaskData + x0, max_idx0);
                                    v_store(dstMaskData + x0 + 4, max_idx1);
                                }
                                x0 += 7;
                            }
                            else
                            {
                                v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
                                v_float32x4 max_val1 = max_val0;

                                if( yend - ystart == kernel_h )
                                {
                                    const float* srcData1 = srcData + ystart*inp_width + xstart;
                                    if( stride_w == 1 )
                                        for (int k = 0; k < kernel_w*kernel_h; k++)
                                        {
                                            int index = ofsptr[k];
                                            v_float32x4 v0 = v_load(srcData1 + index);
                                            v_float32x4 v1 = v_load(srcData1 + index + 4);
                                            max_val0 = v_max(max_val0, v0);
                                            max_val1 = v_max(max_val1, v1);
                                        }
#if CV_SSE2
                                    else if( stride_w == 2 )
                                        for (int k = 0; k < kernel_w*kernel_h; k++)
                                        {
                                            int index = ofsptr[k];
                                            v_float32x4 v00 = v_load(srcData1 + index), v01 = v_load(srcData1 + index + 4);
                                            v_float32x4 v0(_mm_shuffle_ps(v00.val, v01.val, _MM_SHUFFLE(2, 0, 2, 0)));
                                            v_float32x4 v10 = v_load(srcData1 + index + 8), v11 = v_load(srcData1 + index + 12);
                                            v_float32x4 v1(_mm_shuffle_ps(v10.val, v11.val, _MM_SHUFFLE(2, 0, 2, 0)));
                                            max_val0 = v_max(max_val0, v0);
                                            max_val1 = v_max(max_val1, v1);
                                        }
#endif
                                    else
                                        for (int k = 0; k < kernel_w*kernel_h; k++)
                                        {
                                            int index = ofsptr[k];
                                            v_float32x4 v0(srcData1[index], srcData1[index + stride_w],
                                                           srcData1[index + stride_w*2], srcData1[index + stride_w*3]);
                                            v_float32x4 v1(srcData1[index + stride_w*4], srcData1[index + stride_w*5],
                                                           srcData1[index + stride_w*6], srcData1[index + stride_w*7]);
                                            max_val0 = v_max(max_val0, v0);
                                            max_val1 = v_max(max_val1, v1);
                                        }
                                }
                                else
                                {
                                    for (int y = ystart; y < yend; ++y)
                                    {
                                        for (int x = xstart; x < xend; ++x)
                                        {
                                            const int index = y * inp_width + x;
                                            v_float32x4 v0(srcData[index], srcData[index + stride_w],
                                                           srcData[index + stride_w*2], srcData[index + stride_w*3]);
                                            v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
                                                           srcData[index + stride_w*6], srcData[index + stride_w*7]);
                                            max_val0 = v_max(max_val0, v0);
                                            max_val1 = v_max(max_val1, v1);
                                        }
                                    }
                                }
                                v_store(dstData + x0, max_val0);
                                v_store(dstData + x0 + 4, max_val1);
                                x0 += 7;
                            }
                        }
                        else
#endif
                        {
                            float max_val = -FLT_MAX;
                            if( compMaxIdx )
                            {
                                int max_index = -1;
                                for (int y = ystart; y < yend; ++y)
                                    for (int x = xstart; x < xend; ++x)
                                    {
                                        const int index = y * inp_width + x;
                                        float val = srcData[index];
                                        if (val > max_val)
                                        {
                                            max_val = val;
                                            max_index = index;
                                        }
                                    }

                                dstData[x0] = max_val;
                                if (dstMaskData)
                                    dstMaskData[x0] = max_index;
                            }
                            else
                            {
                                for (int y = ystart; y < yend; ++y)
                                    for (int x = xstart; x < xend; ++x)
                                    {
                                        const int index = y * inp_width + x;
                                        float val = srcData[index];
                                        max_val = std::max(max_val, val);
                                    }

                                dstData[x0] = max_val;
                            }
                        }
                    }
                else if (poolingType == AVE)
                {
                    for( ; x0 < x1; x0++ )
                    {
                        int xstart = x0 * stride_w - pad_w;
                        int xend = min(xstart + kernel_w, inp_width + pad_w);
                        int xdelta = xend - xstart;
                        xstart = max(xstart, 0);
                        xend = min(xend, inp_width);
                        float inv_kernel_area = 1.f/(ydelta*xdelta);

#if CV_SIMD128
                        if( xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
                        {
                            v_float32x4 sum_val0 = v_setzero_f32(), sum_val1 = v_setzero_f32();
                            v_float32x4 ikarea = v_setall_f32(inv_kernel_area);

                            for (int y = ystart; y < yend; ++y)
                            {
                                for (int x = xstart; x < xend; ++x)
                                {
                                    const int index = y * inp_width + x;
                                    v_float32x4 v0(srcData[index], srcData[index + stride_w],
                                                   srcData[index + stride_w*2], srcData[index + stride_w*3]);
                                    v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
                                                   srcData[index + stride_w*6], srcData[index + stride_w*7]);
                                    sum_val0 += v0;
                                    sum_val1 += v1;
                                }
                            }
                            v_store(dstData + x0, sum_val0*ikarea);
                            v_store(dstData + x0 + 4, sum_val1*ikarea);
                            x0 += 7;
                        }
                        else
#endif
                        {
                            float sum_val = 0.f;
                            for (int y = ystart; y < yend; ++y)
                                for (int x = xstart; x < xend; ++x)
                                {
                                    const int index = y * inp_width + x;
                                    float val = srcData[index];
                                    sum_val += val;
                                }

                            dstData[x0] = sum_val*inv_kernel_area;
                        }
                    }
                }
                else  // ROI
                {
                    const float *roisData = rois->ptr<float>(n);
                    int xstartROI = scaleAndRoundRoi(roisData[1], spatialScale);
                    int xendROI = scaleAndRoundRoi(roisData[3], spatialScale);
                    int roiWidth = std::max(xendROI - xstartROI + 1, 1);
                    float roiRatio = (float)roiWidth / width;
                    for( ; x0 < x1; x0++ )
                    {
                        int xstart = xstartROI + x0 * roiRatio;
                        int xend = xstartROI + std::ceil((x0 + 1) * roiRatio);
                        xstart = max(xstart, 0);
                        xend = min(xend, inp_width);
                        if (xstart >= xend || ystart >= yend)
                        {
                            dstData[x0] = 0;
                            if (compMaxIdx && dstMaskData)
                                dstMaskData[x0] = -1;
                            continue;
                        }
                        float max_val = -FLT_MAX;
                        for (int y = ystart; y < yend; ++y)
                            for (int x = xstart; x < xend; ++x)
                            {
                                const int index = y * inp_width + x;
                                float val = srcData[index];
                                max_val = std::max(max_val, val);
                            }
                        dstData[x0] = max_val;
                    }
                }
            }
        }
    };

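    // Thin wrappers over PoolingInvoker::run, using one stripe per available
    // thread; arguments a particular mode does not use (rois for plain pooling,
    // mask for AVE/ROI) are passed as empty Mats.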
    void maxPooling(Mat &src, Mat &dst, Mat &mask)
    {
        const int nstripes = getNumThreads();
        Mat rois;
        PoolingInvoker::run(src, rois, dst, mask, kernel, stride, pad, type, spatialScale, computeMaxIdx, nstripes);
    }

    void avePooling(Mat &src, Mat &dst)
    {
        const int nstripes = getNumThreads();
        Mat rois, mask;
        PoolingInvoker::run(src, rois, dst, mask, kernel, stride, pad, type, spatialScale, computeMaxIdx, nstripes);
    }

    void roiPooling(const Mat &src, const Mat &rois, Mat &dst)
    {
        const int nstripes = getNumThreads();
        Mat mask;
        PoolingInvoker::run(src, rois, dst, mask, kernel, stride, pad, type, spatialScale, computeMaxIdx, nstripes);
    }

    virtual Ptr<BackendNode> initMaxPoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
    {
#ifdef HAVE_HALIDE
        Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
        const int inWidth = inputBuffer.width();
        const int inHeight = inputBuffer.height();

        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        Halide::RDom r(0, kernel.width, 0, kernel.height);
        Halide::Expr kx, ky;
        if (pad.width || pad.height)
        {
            kx = clamp(x * stride.width + r.x - pad.width, 0, inWidth - 1);
            ky = clamp(y * stride.height + r.y - pad.height, 0, inHeight - 1);
        }
        else
        {
            kx = min(x * stride.width + r.x, inWidth - 1);
            ky = min(y * stride.height + r.y, inHeight - 1);
        }

        // Halide::argmax returns tuple (r.x, r.y, max).
        Halide::Tuple res = argmax(inputBuffer(kx, ky, c, n));

        // Convert the argmax coordinates into a flat index within the input plane.
        Halide::Expr max_index;
        if (pad.width || pad.height)
        {
            max_index = clamp(y * stride.height + res[1] - pad.height,
                              0, inHeight - 1) * inWidth +
                        clamp(x * stride.width + res[0] - pad.width,
                              0, inWidth - 1);
        }
        else
        {
            max_index = min(y * stride.height + res[1], inHeight - 1) * inWidth +
                        min(x * stride.width + res[0], inWidth - 1);
        }
        top(x, y, c, n) = { res[2], Halide::cast<float>(max_index) };
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }

    virtual Ptr<BackendNode> initAvePoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
    {
#ifdef HAVE_HALIDE
        Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);

        const int inW = inputBuffer.width(), inH = inputBuffer.height();
        if ((inW - kernel.width) % stride.width || (inH - kernel.height) % stride.height)
        {
            CV_Error(cv::Error::StsNotImplemented,
                     "Halide backend for average pooling with partial "
                     "kernels is not implemented");
        }

        const float norm = 1.0f / (kernel.width * kernel.height);

        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        Halide::RDom r(0, kernel.width, 0, kernel.height);
        top(x, y, c, n) = sum(
            inputBuffer(x * stride.width + r.x,
                        y * stride.height + r.y, c, n)) * norm;
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }

    virtual void applyHalideScheduler(Ptr<BackendNode>& node,
                                      const std::vector<Mat*> &inputs,
                                      const std::vector<Mat> &outputs,
                                      int targetId) const
    {
#ifdef HAVE_HALIDE
        if (targetId != DNN_TARGET_CPU)
        {
            Layer::applyHalideScheduler(node, inputs, outputs, targetId);
            return;
        }
        Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"),
                    xi("xi"), yi("yi"), ci("ci"), xo("xo"), yo("yo"), co("co");
        Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();

        int outW, outH, outC, outN;
        getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);

        if (outW < 8 || outH < 8)
        {
            if (outC > 8)
                top.split(c, co, ci, 8)
                   .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
                   .parallel(tile)
                   .vectorize(ci);
            else
            {
                top.fuse(y, c, tile).fuse(n, tile, tile)
                   .parallel(tile);
                if (outW > 1)
                    top.vectorize(x);
            }
        }
        else
        {
            if (outC > 8)
                top.split(x, xo, xi, 8).split(y, yo, yi, 8).split(c, co, ci, 8)
                   .fuse(xo, yo, tile).fuse(co, tile, tile).fuse(n, tile, tile)
                   .parallel(tile)
                   .vectorize(xi);
            else
                top.split(x, xo, xi, 8).split(y, yo, yi, 8)
                   .fuse(xo, yo, tile).fuse(c, tile, tile).fuse(n, tile, tile)
                   .parallel(tile)
                   .vectorize(xi);
        }
#endif  // HAVE_HALIDE
    }

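    // Output shape: 1x1 for global pooling, the configured pooled size for ROI
    // pooling, the Caffe-style ceil/floor formula when no padMode is given, and
    // getConvPoolOutParams() for SAME/VALID padding. MAX pooling declares a
    // second output that holds the argmax mask.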
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const
    {
        CV_Assert(inputs.size() != 0);
        Size in(inputs[0][3], inputs[0][2]), out;

        if (globalPooling)
        {
            out.height = 1;
            out.width = 1;
        }
        else if (type == ROI)
        {
            out.height = pooledSize.height;
            out.width = pooledSize.width;
        }
        else if (padMode.empty())
        {
            float height = (float)(in.height + 2 * pad.height - kernel.height) / stride.height;
            float width = (float)(in.width + 2 * pad.width - kernel.width) / stride.width;
            out.height = 1 + (ceilMode ? ceil(height) : floor(height));
            out.width = 1 + (ceilMode ? ceil(width) : floor(width));

            if (pad.height || pad.width)
            {
                // If we have padding, ensure that the last pooling starts strictly
                // inside the image (instead of at the padding); otherwise clip the last.
                if ((out.height - 1) * stride.height >= in.height + pad.height)
                    --out.height;
                if ((out.width - 1) * stride.width >= in.width + pad.width)
                    --out.width;
                CV_Assert((out.height - 1) * stride.height < in.height + pad.height);
                CV_Assert((out.width - 1) * stride.width < in.width + pad.width);
            }
        }
        else
        {
            getConvPoolOutParams(in, kernel, stride, padMode, Size(1, 1), out);
        }

        int dims[] = {inputs[0][0], inputs[0][1], out.height, out.width};
        if (type == ROI)
        {
            CV_Assert(inputs.size() == 2);
            dims[0] = inputs[1][0];  // Number of proposals.
        }
        outputs.assign(type == MAX ? 2 : 1, shape(dims));
        return false;
    }

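    // FLOPS estimate: one comparison per kernel element for MAX (the mask outputs
    // at odd indices are skipped), and kernel.area() additions plus one multiply
    // per output element for AVE/ROI.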
    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const
    {
        (void)inputs; // suppress unused variable warning
        long flops = 0;

        for(int i = 0; i < outputs.size(); i++)
        {
            if (type == MAX)
            {
                if (i%2 == 0)
                    flops += total(outputs[i])*kernel.area();
            }
            else
            {
                flops += total(outputs[i])*(kernel.area() + 1);
            }
        }
        return flops;
    }
private:
    enum Type
    {
        MAX,
        AVE,
        STOCHASTIC,
        ROI
    };
};

Ptr<PoolingLayer> PoolingLayer::create(const LayerParams& params)
{
    return Ptr<PoolingLayer>(new PoolingLayerImpl(params));
}

}
}