/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "../precomp.hpp"
#include "layers_common.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "op_halide.hpp"
#include "opencl_kernels_dnn.hpp"
#include <float.h>
#include <algorithm>
using std::max;
using std::min;
#ifdef HAVE_OPENCL
using namespace cv::dnn::ocl4dnn;
#endif

namespace cv
{
namespace dnn
{

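// Pooling layer implementation (max/average/stochastic). The pooling method,
// kernel size, strides and padding are read from LayerParams in the constructor;
// the actual computation is dispatched in forward().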
class PoolingLayerImpl : public PoolingLayer
{
public:
    PoolingLayerImpl(const LayerParams& params)
    {
        type = PoolingLayer::MAX;
        computeMaxIdx = true;

        if (params.has("pool"))
        {
            String pool = params.get<String>("pool").toLowerCase();
            if (pool == "max")
                type = PoolingLayer::MAX;
            else if (pool == "ave")
                type = PoolingLayer::AVE;
            else if (pool == "stochastic")
                type = PoolingLayer::STOCHASTIC;
            else
                CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\"");
        }

        getPoolingKernelParams(params, kernel.height, kernel.width, globalPooling,
                               pad.height, pad.width, stride.height, stride.width, padMode);
        setParamsFrom(params);
        ceilMode = params.get<bool>("ceil_mode", true);
    }

#ifdef HAVE_OPENCL
    Ptr<OCL4DNNPool<float> > poolOp;
#endif

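    // Called once input/output shapes are known: for global pooling the kernel
    // is set to the full input plane, and the effective paddings are recomputed
    // from the chosen padding mode.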
    void finalize(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
    {
        CV_Assert(inputs.size() == 1);

        cv::Size inp(inputs[0]->size[3], inputs[0]->size[2]),
                out(outputs[0].size[3], outputs[0].size[2]);

        if(globalPooling)
        {
            kernel = inp;
        }

        getConvPoolPaddings(inp, out, kernel, stride, padMode, Size(1, 1), pad);
    }

    virtual bool supportBackend(int backendId)
    {
        return backendId == DNN_BACKEND_DEFAULT ||
               (backendId == DNN_BACKEND_HALIDE && haveHalide() &&
                (type == PoolingLayer::MAX ||
                 (type == PoolingLayer::AVE && !pad.width && !pad.height)));
    }

#ifdef HAVE_OPENCL
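    // OpenCL path: lazily creates an OCL4DNNPool instance configured from the
    // layer parameters, then runs it on UMat inputs/outputs. For max pooling the
    // outputs come in pairs (pooled values followed by the index mask).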
    bool forward_ocl(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
        if (poolOp.empty())
        {
            OCL4DNNPoolConfig config;

            config.in_shape = shape(*inputs[0]);
            config.out_shape = shape(outputs[0]);
            config.kernel = kernel;
            config.pad = pad;
            config.stride = stride;
            config.channels = inputs[0]->size[1];
            config.pool_method = type == MAX ? LIBDNN_POOLING_METHOD_MAX :
                                (type == AVE ? LIBDNN_POOLING_METHOD_AVE :
                                               LIBDNN_POOLING_METHOD_STO);
            poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config));
        }

        for (size_t ii = 0; ii < inputs.size(); ii++)
        {
            UMat inpMat, outMat, maskMat;

            inpMat = inputs[ii]->getUMat(ACCESS_READ);

            if (type == MAX)
            {
                outMat = outputs[2 * ii].getUMat(ACCESS_WRITE);
                maskMat = outputs[2 * ii + 1].getUMat(ACCESS_WRITE);
            } else {
                outMat = outputs[ii].getUMat(ACCESS_WRITE);
                maskMat = UMat();
            }

            CV_Assert(inpMat.offset == 0 && outMat.offset == 0);

            if (!poolOp->Forward(inpMat, outMat, maskMat))
                return false;
        }

        return true;
    }
#endif

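    // Forward pass. Tries the OpenCL branch first (when the target is
    // DNN_TARGET_OPENCL on an Intel device), otherwise dispatches to the
    // parallel max/average pooling implementations below.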
    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
                   forward_ocl(inputs, outputs, internals))

        for (size_t ii = 0; ii < inputs.size(); ii++)
        {
            switch (type)
            {
                case MAX:
                    maxPooling(*inputs[ii], outputs[2 * ii], outputs[2 * ii + 1]);
                    break;
                case AVE:
                    avePooling(*inputs[ii], outputs[ii]);
                    break;
                default:
                    CV_Error(Error::StsNotImplemented, "Not implemented");
                    break;
            }
        }
    }

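    // Builds a Halide graph for the supported pooling types; other types fall
    // back to an empty backend node.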
    virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
    {
        if (type == PoolingLayer::MAX)
            return initMaxPoolingHalide(inputs);
        else if (type == PoolingLayer::AVE)
            return initAvePoolingHalide(inputs);
        else
            return Ptr<BackendNode>();
    }

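    // Body for parallel_for_: each worker handles a contiguous stripe of the
    // flattened NCHW output. ofsbuf caches per-kernel-element input offsets for
    // the SIMD max-pooling path that does not track argmax indices.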
    class PoolingInvoker : public ParallelLoopBody
    {
    public:
        const Mat* src;
        Mat *dst, *mask;
        Size kernel, stride, pad;
        int nstripes;
        bool computeMaxIdx;
        std::vector<int> ofsbuf;
        int poolingType;

        PoolingInvoker() : src(0), dst(0), mask(0), nstripes(0), computeMaxIdx(0), poolingType(PoolingLayer::MAX) {}

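        // Validates shapes/types, fills the invoker fields and launches
        // parallel_for_ over nstripes stripes.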
        static void run(const Mat& src, Mat& dst, Mat& mask, Size kernel,
                        Size stride, Size pad, int poolingType,
                        bool computeMaxIdx, int nstripes)
        {
            CV_Assert(src.isContinuous() && dst.isContinuous() &&
                      src.type() == CV_32F && src.type() == dst.type() &&
                      src.dims == 4 && dst.dims == 4 &&
                      src.size[0] == dst.size[0] && src.size[1] == dst.size[1] &&
                      (mask.empty() || (mask.type() == src.type() && mask.size == dst.size)));

            PoolingInvoker p;

            p.src = &src;
            p.dst = &dst;
            p.mask = &mask;
            p.kernel = kernel;
            p.stride = stride;
            p.pad = pad;
            p.nstripes = nstripes;
            p.computeMaxIdx = computeMaxIdx;
            p.poolingType = poolingType;

            if( !computeMaxIdx )
            {
                p.ofsbuf.resize(kernel.width*kernel.height);
                for( int i = 0; i < kernel.height; i++ )
                    for( int j = 0; j < kernel.width; j++ )
                        p.ofsbuf[i*kernel.width + j] = src.size[3]*i + j;
            }

            parallel_for_(Range(0, nstripes), p, nstripes);
        }

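        // Processes one range of stripes: converts flat output offsets back to
        // (n, c, y, x) coordinates and applies max or average pooling per output row.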
        void operator()(const Range& r) const
        {
            int channels = dst->size[1], width = dst->size[3], height = dst->size[2];
            int inp_width = src->size[3], inp_height = src->size[2];
            size_t total = dst->total();
            size_t stripeSize = (total + nstripes - 1)/nstripes;
            size_t stripeStart = r.start*stripeSize;
            size_t stripeEnd = std::min(r.end*stripeSize, total);
            int kernel_w = kernel.width, kernel_h = kernel.height;
            int pad_w = pad.width, pad_h = pad.height;
            int stride_w = stride.width, stride_h = stride.height;
            bool compMaxIdx = computeMaxIdx;

#if CV_SIMD128
            const int* ofsptr = &ofsbuf[0];
            v_float32x4 idx00(0.f, (float)stride_w, (float)(stride_w*2), (float)(stride_w*3));
            v_float32x4 ones = v_setall_f32(1.f);
            v_float32x4 idx_delta = v_setall_f32((float)(inp_width - kernel_w));
#endif

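            // Walk the stripe row by row: decode the flat offset ofs0 into
            // (n, c, y0, x0), clamp the pooling window vertically, then handle
            // up to (width - x0) consecutive output columns in the inner loops.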
            for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
            {
                size_t ofs = ofs0;
                int x0 = (int)(ofs % width);
                ofs /= width;
                int y0 = (int)(ofs % height);
                ofs /= height;
                int c = (int)(ofs % channels);
                int n = (int)(ofs / channels);
                int ystart = y0 * stride_h - pad_h;
                int yend = min(ystart + kernel_h, inp_height + pad_h);
                int ydelta = yend - ystart;
                ystart = max(ystart, 0);
                yend = min(yend, inp_height);
                const float *srcData = src->ptr<float>(n, c);
                float *dstData = dst->ptr<float>(n, c, y0);
                float *dstMaskData = mask->data ? mask->ptr<float>(n, c, y0) : 0;

                int delta = std::min((int)(stripeEnd - ofs0), width - x0);
                ofs0 += delta;
                int x1 = x0 + delta;

                if( poolingType == PoolingLayer::MAX )
                    for( ; x0 < x1; x0++ )
                    {
                        int xstart = x0 * stride_w - pad_w;
                        int xend = min(xstart + kernel_w, inp_width);
                        xstart = max(xstart, 0);

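                        // Vectorized fast path: when 8 consecutive output columns and their
                        // pooling windows lie entirely inside the input row, process them with
                        // two v_float32x4 accumulators; otherwise fall back to the scalar code.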
#if CV_SIMD128
                        if( xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
                        {
                            if( compMaxIdx )
                            {
                                v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
                                v_float32x4 max_val1 = max_val0;
                                v_float32x4 max_idx0 = v_setall_f32(-1.f);
                                v_float32x4 max_idx1 = max_idx0;
                                int index0 = ystart * inp_width + xstart;
                                v_float32x4 idx0 = idx00 + v_setall_f32((float)index0);
                                v_float32x4 idx1 = idx0 + v_setall_f32((float)(stride_w*4));

                                for (int y = ystart; y < yend; ++y)
                                {
                                    for (int x = xstart; x < xend; ++x, idx0 += ones, idx1 += ones)
                                    {
                                        const int index = y * inp_width + x;
                                        v_float32x4 v0(srcData[index], srcData[index + stride_w],
                                                       srcData[index + stride_w*2], srcData[index + stride_w*3]);
                                        v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
                                                       srcData[index + stride_w*6], srcData[index + stride_w*7]);
                                        max_idx0 = v_select(v0 > max_val0, idx0, max_idx0);
                                        max_idx1 = v_select(v1 > max_val1, idx1, max_idx1);
                                        max_val0 = v_max(max_val0, v0);
                                        max_val1 = v_max(max_val1, v1);
                                    }
                                    idx0 += idx_delta;
                                    idx1 += idx_delta;
                                }
                                v_store(dstData + x0, max_val0);
                                v_store(dstData + x0 + 4, max_val1);
                                if (dstMaskData)
                                {
                                    v_store(dstMaskData + x0, max_idx0);
                                    v_store(dstMaskData + x0 + 4, max_idx1);
                                }
                                x0 += 7;
                            }
                            else
                            {
                                v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
                                v_float32x4 max_val1 = max_val0;

                                if( yend - ystart == kernel_h )
                                {
                                    const float* srcData1 = srcData + ystart*inp_width + xstart;
                                    if( stride_w == 1 )
                                        for (int k = 0; k < kernel_w*kernel_h; k++)
                                        {
                                            int index = ofsptr[k];
                                            v_float32x4 v0 = v_load(srcData1 + index);
                                            v_float32x4 v1 = v_load(srcData1 + index + 4);
                                            max_val0 = v_max(max_val0, v0);
                                            max_val1 = v_max(max_val1, v1);
                                        }
#if CV_SSE2
                                    else if( stride_w == 2 )
                                        for (int k = 0; k < kernel_w*kernel_h; k++)
                                        {
                                            int index = ofsptr[k];
                                            v_float32x4 v00 = v_load(srcData1 + index), v01 = v_load(srcData1 + index + 4);
                                            v_float32x4 v0(_mm_shuffle_ps(v00.val, v01.val, _MM_SHUFFLE(2, 0, 2, 0)));
                                            v_float32x4 v10 = v_load(srcData1 + index + 8), v11 = v_load(srcData1 + index + 12);
                                            v_float32x4 v1(_mm_shuffle_ps(v10.val, v11.val, _MM_SHUFFLE(2, 0, 2, 0)));
                                            max_val0 = v_max(max_val0, v0);
                                            max_val1 = v_max(max_val1, v1);
                                        }
#endif
                                    else
                                        for (int k = 0; k < kernel_w*kernel_h; k++)
                                        {
                                            int index = ofsptr[k];
                                            v_float32x4 v0(srcData1[index], srcData1[index + stride_w],
                                                           srcData1[index + stride_w*2], srcData1[index + stride_w*3]);
                                            v_float32x4 v1(srcData1[index + stride_w*4], srcData1[index + stride_w*5],
                                                           srcData1[index + stride_w*6], srcData1[index + stride_w*7]);
                                            max_val0 = v_max(max_val0, v0);
                                            max_val1 = v_max(max_val1, v1);
                                        }
                                }
                                else
                                {
                                    for (int y = ystart; y < yend; ++y)
                                    {
                                        for (int x = xstart; x < xend; ++x)
                                        {
                                            const int index = y * inp_width + x;
                                            v_float32x4 v0(srcData[index], srcData[index + stride_w],
                                                           srcData[index + stride_w*2], srcData[index + stride_w*3]);
                                            v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
                                                           srcData[index + stride_w*6], srcData[index + stride_w*7]);
                                            max_val0 = v_max(max_val0, v0);
                                            max_val1 = v_max(max_val1, v1);
                                        }
                                    }
                                }
                                v_store(dstData + x0, max_val0);
                                v_store(dstData + x0 + 4, max_val1);
                                x0 += 7;
                            }
                        }
                        else
#endif
                        {
                            float max_val = -FLT_MAX;
                            if( compMaxIdx )
                            {
                                int max_index = -1;
                                for (int y = ystart; y < yend; ++y)
                                    for (int x = xstart; x < xend; ++x)
                                    {
                                        const int index = y * inp_width + x;
                                        float val = srcData[index];
                                        if (val > max_val)
                                        {
                                            max_val = val;
                                            max_index = index;
                                        }
                                    }

                                dstData[x0] = max_val;
                                if (dstMaskData)
                                    dstMaskData[x0] = max_index;
                            }
                            else
                            {
                                for (int y = ystart; y < yend; ++y)
                                    for (int x = xstart; x < xend; ++x)
                                    {
                                        const int index = y * inp_width + x;
                                        float val = srcData[index];
                                        max_val = std::max(max_val, val);
                                    }

                                dstData[x0] = max_val;
                            }
                        }
                    }
                else
                {
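                    // Average pooling: the window is clipped to the input, but the divisor
                    // (ydelta * xdelta) is the window size before clipping, so padded
                    // positions count toward the average.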
                    for( ; x0 < x1; x0++ )
                    {
                        int xstart = x0 * stride_w - pad_w;
                        int xend = min(xstart + kernel_w, inp_width + pad_w);
                        int xdelta = xend - xstart;
                        xstart = max(xstart, 0);
                        xend = min(xend, inp_width);
                        float inv_kernel_area = 1.f/(ydelta*xdelta);

#if CV_SIMD128
                        if( xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
                        {
                            v_float32x4 sum_val0 = v_setzero_f32(), sum_val1 = v_setzero_f32();
                            v_float32x4 ikarea = v_setall_f32(inv_kernel_area);

                            for (int y = ystart; y < yend; ++y)
                            {
                                for (int x = xstart; x < xend; ++x)
                                {
                                    const int index = y * inp_width + x;
                                    v_float32x4 v0(srcData[index], srcData[index + stride_w],
                                                   srcData[index + stride_w*2], srcData[index + stride_w*3]);
                                    v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
                                                   srcData[index + stride_w*6], srcData[index + stride_w*7]);
                                    sum_val0 += v0;
                                    sum_val1 += v1;
                                }
                            }
                            v_store(dstData + x0, sum_val0*ikarea);
                            v_store(dstData + x0 + 4, sum_val1*ikarea);
                            x0 += 7;
                        }
                        else
#endif
                        {
                            float sum_val = 0.f;
                            for (int y = ystart; y < yend; ++y)
                                for (int x = xstart; x < xend; ++x)
                                {
                                    const int index = y * inp_width + x;
                                    float val = srcData[index];
                                    sum_val += val;
                                }

                            dstData[x0] = sum_val*inv_kernel_area;
                        }
                    }
                }
            }
        }
    };

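    // Thin wrappers that run the PoolingInvoker with the layer's parameters.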
    void maxPooling(Mat &src, Mat &dst, Mat &mask)
    {
        const int nstripes = getNumThreads();
        PoolingInvoker::run(src, dst, mask, kernel, stride, pad, type, computeMaxIdx, nstripes);
    }

    void avePooling(Mat &src, Mat &dst)
    {
        const int nstripes = getNumThreads();
        Mat mask;
        PoolingInvoker::run(src, dst, mask, kernel, stride, pad, type, computeMaxIdx, nstripes);
    }

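    // Halide max pooling: uses argmax over the kernel window and also emits the
    // flattened input index of the maximum as a second output (the mask).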
    virtual Ptr<BackendNode> initMaxPoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
    {
#ifdef HAVE_HALIDE
        Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
        const int inWidth = inputBuffer.width();
        const int inHeight = inputBuffer.height();

        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        Halide::RDom r(0, kernel.width, 0, kernel.height);
        Halide::Expr kx, ky;
        if (pad.width || pad.height)
        {
            kx = clamp(x * stride.width + r.x - pad.width, 0, inWidth - 1);
            ky = clamp(y * stride.height + r.y - pad.height, 0, inHeight - 1);
        }
        else
        {
            kx = min(x * stride.width + r.x, inWidth - 1);
            ky = min(y * stride.height + r.y, inHeight - 1);
        }

        // Halide::argmax returns tuple (r.x, r.y, max).
        Halide::Tuple res = argmax(inputBuffer(kx, ky, c, n));

        // Compute offset from argmax in range [0, kernel_size).
        Halide::Expr max_index;
        if (pad.width || pad.height)
        {
            max_index = clamp(y * stride.height + res[1] - pad.height,
                              0, inHeight - 1) * inWidth +
                        clamp(x * stride.width + res[0] - pad.width,
                              0, inWidth - 1);
        }
        else
        {
            max_index = min(y * stride.height + res[1], inHeight - 1) * inWidth +
                        min(x * stride.width + res[0], inWidth - 1);
        }
        top(x, y, c, n) = { res[2], Halide::cast<float>(max_index) };
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }

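    // Halide average pooling: only full (non-partial) kernel placements are
    // supported; the sum over the window is scaled by 1 / kernel area.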
    virtual Ptr<BackendNode> initAvePoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
    {
#ifdef HAVE_HALIDE
        Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);

        const int inW = inputBuffer.width(), inH = inputBuffer.height();
        if ((inW - kernel.width) % stride.width || (inH - kernel.height) % stride.height)
        {
            CV_Error(cv::Error::StsNotImplemented,
                     "Halide backend for average pooling with partial "
                     "kernels is not implemented");
        }

        const float norm = 1.0f / (kernel.width * kernel.height);

        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        Halide::RDom r(0, kernel.width, 0, kernel.height);
        top(x, y, c, n) = sum(
            inputBuffer(x * stride.width + r.x,
                        y * stride.height + r.y, c, n)) * norm;
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }

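    // CPU-specific Halide schedule: tiles/fuses the output dimensions and
    // vectorizes along x or c depending on the output size; other targets use
    // the default scheduler from Layer.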
    virtual void applyHalideScheduler(Ptr<BackendNode>& node,
                                      const std::vector<Mat*> &inputs,
                                      const std::vector<Mat> &outputs,
                                      int targetId) const
    {
#ifdef  HAVE_HALIDE
        if (targetId != DNN_TARGET_CPU)
        {
            Layer::applyHalideScheduler(node, inputs, outputs, targetId);
            return;
        }
        Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"),
                    xi("xi"), yi("yi"), ci("ci"), xo("xo"), yo("yo"), co("co");
        Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();

        int outW, outH, outC, outN;
        getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);

        if (outW < 8 || outH < 8)
        {
            if (outC > 8)
                top.split(c, co, ci, 8)
                   .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
                   .parallel(tile)
                   .vectorize(ci);
            else
            {
                top.fuse(y, c, tile).fuse(n, tile, tile)
                   .parallel(tile);
                if (outW > 1)
                    top.vectorize(x);
            }
        }
        else
        {
            if (outC > 8)
                top.split(x, xo, xi, 8).split(y, yo, yi, 8).split(c, co, ci, 8)
                   .fuse(xo, yo, tile).fuse(co, tile, tile).fuse(n, tile, tile)
                   .parallel(tile)
                   .vectorize(xi);
            else
                top.split(x, xo, xi, 8).split(y, yo, yi, 8)
                   .fuse(xo, yo, tile).fuse(c, tile, tile).fuse(n, tile, tile)
                   .parallel(tile)
                   .vectorize(xi);
        }
#endif  // HAVE_HALIDE
    }

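    // Computes output shapes. Without an explicit padMode the spatial size is
    // derived from kernel/stride/pad using ceil or floor (ceil_mode); for max
    // pooling every input produces two outputs: values and the index mask.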
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const
    {
        CV_Assert(inputs.size() != 0);
        Size in(inputs[0][3], inputs[0][2]), out;

        if (globalPooling)
        {
            out.height = 1;
            out.width = 1;
        }
        else if (padMode.empty())
        {
            float height = (float)(in.height + 2 * pad.height - kernel.height) / stride.height;
            float width = (float)(in.width + 2 * pad.width - kernel.width) / stride.width;
            out.height = 1 + (ceilMode ? ceil(height) : floor(height));
            out.width = 1 + (ceilMode ? ceil(width) : floor(width));

            if (pad.height || pad.width)
            {
                // If we have padding, ensure that the last pooling starts strictly
                // inside the image (instead of at the padding); otherwise clip the last.
                if ((out.height - 1) * stride.height >= in.height + pad.height)
                    --out.height;
                if ((out.width - 1) * stride.width >= in.width + pad.width)
                    --out.width;
                CV_Assert((out.height - 1) * stride.height < in.height + pad.height);
                CV_Assert((out.width - 1) * stride.width < in.width + pad.width);
            }
        }
        else
        {
            getConvPoolOutParams(in, kernel, stride, padMode, Size(1, 1), out);
        }

        outputs.resize(type == MAX ? 2 * inputs.size() : inputs.size());
        for (size_t i = 0; i < inputs.size(); i++)
        {
            size_t index = type == MAX ? 2*i : i;
            int dims[] = {inputs[i][0], inputs[i][1], out.height, out.width};
            outputs[index] = shape(dims);

            if (type == MAX)
                outputs[index + 1] = shape(dims);
        }

        return false;
    }

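    // Rough FLOP estimate: one comparison per kernel element for max pooling
    // (mask outputs are skipped), and kernel area + 1 operations per output
    // element for average pooling.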
    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const
    {
        (void)inputs; // suppress unused variable warning
        long flops = 0;

        for(int i = 0; i < outputs.size(); i++)
        {
            if (type == MAX)
            {
                if (i%2 == 0)
                    flops += total(outputs[i])*kernel.area();
            }
            else
            {
                flops += total(outputs[i])*(kernel.area() + 1);
            }
        }
        return flops;
    }
};

Ptr<PoolingLayer> PoolingLayer::create(const LayerParams& params)
{
    return Ptr<PoolingLayer>(new PoolingLayerImpl(params));
}

}
}