// modules/dnn/src/layers/pooling_layer.cpp @ merge of PR #9996 (dkurt:dnn_multiple_inputs)
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "../precomp.hpp"
#include "layers_common.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "op_halide.hpp"
#include "opencl_kernels_dnn.hpp"
#include <float.h>
#include <algorithm>
using std::max;
using std::min;
#ifdef HAVE_OPENCL
using namespace cv::dnn::ocl4dnn;
#endif

namespace cv
{
namespace dnn
{

class PoolingLayerImpl : public PoolingLayer
{
public:
    PoolingLayerImpl(const LayerParams& params)
    {
        type = PoolingLayer::MAX;
        computeMaxIdx = true;

        if (params.has("pool"))
        {
            String pool = params.get<String>("pool").toLowerCase();
            if (pool == "max")
                type = PoolingLayer::MAX;
            else if (pool == "ave")
                type = PoolingLayer::AVE;
            else if (pool == "stochastic")
                type = PoolingLayer::STOCHASTIC;
            else
                CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\"");
        }

        getPoolingKernelParams(params, kernel.height, kernel.width, globalPooling,
                               pad.height, pad.width, stride.height, stride.width, padMode);
        setParamsFrom(params);
        ceilMode = params.get<bool>("ceil_mode", true);
    }

#ifdef HAVE_OPENCL
    Ptr<OCL4DNNPool<float> > poolOp;
#endif

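    // Called once input/output shapes are known: for global pooling the kernel
    // is set to the full spatial size of the input, and the effective paddings
    // are recomputed from the chosen padMode.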
    void finalize(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
    {
        CV_Assert(inputs.size() == 1);

        cv::Size inp(inputs[0]->size[3], inputs[0]->size[2]),
                out(outputs[0].size[3], outputs[0].size[2]);

        if(globalPooling)
        {
            kernel = inp;
        }

        getConvPoolPaddings(inp, out, kernel, stride, padMode, Size(1, 1), pad);
    }

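    // The default (CPU/OpenCL) backend is always supported; the Halide backend is
    // used only for MAX pooling, or for AVE pooling without padding
    // (note that && binds tighter than ||, which gives exactly that grouping).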
    virtual bool supportBackend(int backendId)
    {
        return backendId == DNN_BACKEND_DEFAULT ||
               backendId == DNN_BACKEND_HALIDE && haveHalide() &&
               (type == PoolingLayer::MAX ||
                type == PoolingLayer::AVE && !pad.width && !pad.height);
    }

#ifdef HAVE_OPENCL
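    // OpenCL path via OCL4DNN. For MAX pooling every input blob produces two
    // outputs -- the pooled values and a mask of max-element indices -- so the
    // outputs vector is laid out as [out0, mask0, out1, mask1, ...].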
    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, InputArrayOfArrays internals)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);

        if (poolOp.empty())
        {
            OCL4DNNPoolConfig config;

            config.in_shape = shape(inputs[0]);
            config.out_shape = shape(outputs[0]);
            config.kernel = kernel;
            config.pad = pad;
            config.stride = stride;
            config.channels = inputs[0].size[1];
            config.pool_method = type == MAX ? LIBDNN_POOLING_METHOD_MAX :
                                (type == AVE ? LIBDNN_POOLING_METHOD_AVE :
                                               LIBDNN_POOLING_METHOD_STO);
            poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config));
        }

        for (size_t ii = 0; ii < inputs.size(); ii++)
        {
            UMat& inpMat = inputs[ii];
            int out_index = (type == MAX) ? 2 : 1;
            UMat& outMat = outputs[out_index * ii];
            UMat maskMat = (type == MAX) ? outputs[2 * ii + 1] : UMat();

            CV_Assert(inpMat.offset == 0 && outMat.offset == 0);

            if (!poolOp->Forward(inpMat, outMat, maskMat))
                return false;
        }

        return true;
    }
#endif

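    // Entry point for inference: try the OpenCL implementation first (only when
    // the OpenCL target is selected and the performance check for Intel devices
    // passes), otherwise fall back to the generic Mat-based forward() below.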
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
    }

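    // CPU implementation: MAX pooling writes a (values, indices) pair per input,
    // AVE pooling writes a single output per input; other types are rejected.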
    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        for (size_t ii = 0; ii < inputs.size(); ii++)
        {
            switch (type)
            {
                case MAX:
                    maxPooling(*inputs[ii], outputs[2 * ii], outputs[2 * ii + 1]);
                    break;
                case AVE:
                    avePooling(*inputs[ii], outputs[ii]);
                    break;
                default:
                    CV_Error(Error::StsNotImplemented, "Not implemented");
                    break;
            }
        }
    }

    virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
    {
        if (type == PoolingLayer::MAX)
            return initMaxPoolingHalide(inputs);
        else if (type == PoolingLayer::AVE)
            return initAvePoolingHalide(inputs);
        else
            return Ptr<BackendNode>();
    }

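    // Parallel worker for the CPU path. The output tensor is split into
    // `nstripes` contiguous ranges of elements and each range is processed
    // independently via parallel_for_. When max indices are not requested,
    // `ofsbuf` caches the flattened input offsets of the kernel window so the
    // inner loop can walk it with a single indexed load per tap.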
    class PoolingInvoker : public ParallelLoopBody
    {
    public:
        const Mat* src;
        Mat *dst, *mask;
        Size kernel, stride, pad;
        int nstripes;
        bool computeMaxIdx;
        std::vector<int> ofsbuf;
        int poolingType;

        PoolingInvoker() : src(0), dst(0), mask(0), nstripes(0), computeMaxIdx(0), poolingType(PoolingLayer::MAX) {}

        static void run(const Mat& src, Mat& dst, Mat& mask, Size kernel,
                        Size stride, Size pad, int poolingType,
                        bool computeMaxIdx, int nstripes)
        {
            CV_Assert(src.isContinuous() && dst.isContinuous() &&
                      src.type() == CV_32F && src.type() == dst.type() &&
                      src.dims == 4 && dst.dims == 4 &&
                      src.size[0] == dst.size[0] && src.size[1] == dst.size[1] &&
                      (mask.empty() || (mask.type() == src.type() && mask.size == dst.size)));

            PoolingInvoker p;

            p.src = &src;
            p.dst = &dst;
            p.mask = &mask;
            p.kernel = kernel;
            p.stride = stride;
            p.pad = pad;
            p.nstripes = nstripes;
            p.computeMaxIdx = computeMaxIdx;
            p.poolingType = poolingType;

            if( !computeMaxIdx )
            {
                p.ofsbuf.resize(kernel.width*kernel.height);
                for( int i = 0; i < kernel.height; i++ )
                    for( int j = 0; j < kernel.width; j++ )
                        p.ofsbuf[i*kernel.width + j] = src.size[3]*i + j;
            }

            parallel_for_(Range(0, nstripes), p, nstripes);
        }

        void operator()(const Range& r) const
        {
            int channels = dst->size[1], width = dst->size[3], height = dst->size[2];
            int inp_width = src->size[3], inp_height = src->size[2];
            size_t total = dst->total();
            size_t stripeSize = (total + nstripes - 1)/nstripes;
            size_t stripeStart = r.start*stripeSize;
            size_t stripeEnd = std::min(r.end*stripeSize, total);
            int kernel_w = kernel.width, kernel_h = kernel.height;
            int pad_w = pad.width, pad_h = pad.height;
            int stride_w = stride.width, stride_h = stride.height;
            bool compMaxIdx = computeMaxIdx;

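            // Universal-intrinsics fast path: when enough output columns remain
            // and the kernel windows of 8 consecutive outputs stay inside the
            // input row, those 8 elements are computed at once in two
            // v_float32x4 registers (max indices are tracked in parallel
            // float registers when needed).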
#if CV_SIMD128
            const int* ofsptr = &ofsbuf[0];
            v_float32x4 idx00(0.f, (float)stride_w, (float)(stride_w*2), (float)(stride_w*3));
            v_float32x4 ones = v_setall_f32(1.f);
            v_float32x4 idx_delta = v_setall_f32((float)(inp_width - kernel_w));
#endif

            for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
            {
                size_t ofs = ofs0;
                int x0 = (int)(ofs % width);
                ofs /= width;
                int y0 = (int)(ofs % height);
                ofs /= height;
                int c = (int)(ofs % channels);
                int n = (int)(ofs / channels);
                int ystart = y0 * stride_h - pad_h;
                int yend = min(ystart + kernel_h, inp_height + pad_h);
                int ydelta = yend - ystart;
                ystart = max(ystart, 0);
                yend = min(yend, inp_height);
                const float *srcData = src->ptr<float>(n, c);
                float *dstData = dst->ptr<float>(n, c, y0);
                float *dstMaskData = mask->data ? mask->ptr<float>(n, c, y0) : 0;

                int delta = std::min((int)(stripeEnd - ofs0), width - x0);
                ofs0 += delta;
                int x1 = x0 + delta;

                if( poolingType == PoolingLayer::MAX )
                    for( ; x0 < x1; x0++ )
                    {
                        int xstart = x0 * stride_w - pad_w;
                        int xend = min(xstart + kernel_w, inp_width);
                        xstart = max(xstart, 0);

#if CV_SIMD128
                        if( xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
                        {
                            if( compMaxIdx )
                            {
                                v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
                                v_float32x4 max_val1 = max_val0;
                                v_float32x4 max_idx0 = v_setall_f32(-1.f);
                                v_float32x4 max_idx1 = max_idx0;
                                int index0 = ystart * inp_width + xstart;
                                v_float32x4 idx0 = idx00 + v_setall_f32((float)index0);
                                v_float32x4 idx1 = idx0 + v_setall_f32((float)(stride_w*4));

                                for (int y = ystart; y < yend; ++y)
                                {
                                    for (int x = xstart; x < xend; ++x, idx0 += ones, idx1 += ones)
                                    {
                                        const int index = y * inp_width + x;
                                        v_float32x4 v0(srcData[index], srcData[index + stride_w],
                                                       srcData[index + stride_w*2], srcData[index + stride_w*3]);
                                        v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
                                                       srcData[index + stride_w*6], srcData[index + stride_w*7]);
                                        max_idx0 = v_select(v0 > max_val0, idx0, max_idx0);
                                        max_idx1 = v_select(v1 > max_val1, idx1, max_idx1);
                                        max_val0 = v_max(max_val0, v0);
                                        max_val1 = v_max(max_val1, v1);
                                    }
                                    idx0 += idx_delta;
                                    idx1 += idx_delta;
                                }
                                v_store(dstData + x0, max_val0);
                                v_store(dstData + x0 + 4, max_val1);
                                if (dstMaskData)
                                {
                                    v_store(dstMaskData + x0, max_idx0);
                                    v_store(dstMaskData + x0 + 4, max_idx1);
                                }
                                x0 += 7;
                            }
                            else
                            {
                                v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
                                v_float32x4 max_val1 = max_val0;

                                if( yend - ystart == kernel_h )
                                {
                                    const float* srcData1 = srcData + ystart*inp_width + xstart;
                                    if( stride_w == 1 )
                                        for (int k = 0; k < kernel_w*kernel_h; k++)
                                        {
                                            int index = ofsptr[k];
                                            v_float32x4 v0 = v_load(srcData1 + index);
                                            v_float32x4 v1 = v_load(srcData1 + index + 4);
                                            max_val0 = v_max(max_val0, v0);
                                            max_val1 = v_max(max_val1, v1);
                                        }
#if CV_SSE2
                                    else if( stride_w == 2 )
                                        for (int k = 0; k < kernel_w*kernel_h; k++)
                                        {
                                            int index = ofsptr[k];
                                            v_float32x4 v00 = v_load(srcData1 + index), v01 = v_load(srcData1 + index + 4);
                                            v_float32x4 v0(_mm_shuffle_ps(v00.val, v01.val, _MM_SHUFFLE(2, 0, 2, 0)));
                                            v_float32x4 v10 = v_load(srcData1 + index + 8), v11 = v_load(srcData1 + index + 12);
                                            v_float32x4 v1(_mm_shuffle_ps(v10.val, v11.val, _MM_SHUFFLE(2, 0, 2, 0)));
                                            max_val0 = v_max(max_val0, v0);
                                            max_val1 = v_max(max_val1, v1);
                                        }
#endif
                                    else
                                        for (int k = 0; k < kernel_w*kernel_h; k++)
                                        {
                                            int index = ofsptr[k];
                                            v_float32x4 v0(srcData1[index], srcData1[index + stride_w],
                                                           srcData1[index + stride_w*2], srcData1[index + stride_w*3]);
                                            v_float32x4 v1(srcData1[index + stride_w*4], srcData1[index + stride_w*5],
                                                           srcData1[index + stride_w*6], srcData1[index + stride_w*7]);
                                            max_val0 = v_max(max_val0, v0);
                                            max_val1 = v_max(max_val1, v1);
                                        }
                                }
                                else
                                {
                                    for (int y = ystart; y < yend; ++y)
                                    {
                                        for (int x = xstart; x < xend; ++x)
                                        {
                                            const int index = y * inp_width + x;
                                            v_float32x4 v0(srcData[index], srcData[index + stride_w],
                                                           srcData[index + stride_w*2], srcData[index + stride_w*3]);
                                            v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
                                                           srcData[index + stride_w*6], srcData[index + stride_w*7]);
                                            max_val0 = v_max(max_val0, v0);
                                            max_val1 = v_max(max_val1, v1);
                                        }
                                    }
                                }
                                v_store(dstData + x0, max_val0);
                                v_store(dstData + x0 + 4, max_val1);
                                x0 += 7;
                            }
                        }
                        else
#endif
                        {
                            float max_val = -FLT_MAX;
                            if( compMaxIdx )
                            {
                                int max_index = -1;
                                for (int y = ystart; y < yend; ++y)
                                    for (int x = xstart; x < xend; ++x)
                                    {
                                        const int index = y * inp_width + x;
                                        float val = srcData[index];
                                        if (val > max_val)
                                        {
                                            max_val = val;
                                            max_index = index;
                                        }
                                    }

                                dstData[x0] = max_val;
                                if (dstMaskData)
                                    dstMaskData[x0] = max_index;
                            }
                            else
                            {
                                for (int y = ystart; y < yend; ++y)
                                    for (int x = xstart; x < xend; ++x)
                                    {
                                        const int index = y * inp_width + x;
                                        float val = srcData[index];
                                        max_val = std::max(max_val, val);
                                    }

                                dstData[x0] = max_val;
                            }
                        }
                    }
                else
                {
                    for( ; x0 < x1; x0++ )
                    {
                        int xstart = x0 * stride_w - pad_w;
                        int xend = min(xstart + kernel_w, inp_width + pad_w);
                        int xdelta = xend - xstart;
                        xstart = max(xstart, 0);
                        xend = min(xend, inp_width);
                        float inv_kernel_area = 1.f/(ydelta*xdelta);

#if CV_SIMD128
                        if( xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
                        {
                            v_float32x4 sum_val0 = v_setzero_f32(), sum_val1 = v_setzero_f32();
                            v_float32x4 ikarea = v_setall_f32(inv_kernel_area);

                            for (int y = ystart; y < yend; ++y)
                            {
                                for (int x = xstart; x < xend; ++x)
                                {
                                    const int index = y * inp_width + x;
                                    v_float32x4 v0(srcData[index], srcData[index + stride_w],
                                                   srcData[index + stride_w*2], srcData[index + stride_w*3]);
                                    v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
                                                   srcData[index + stride_w*6], srcData[index + stride_w*7]);
                                    sum_val0 += v0;
                                    sum_val1 += v1;
                                }
                            }
                            v_store(dstData + x0, sum_val0*ikarea);
                            v_store(dstData + x0 + 4, sum_val1*ikarea);
                            x0 += 7;
                        }
                        else
#endif
                        {
                            float sum_val = 0.f;
                            for (int y = ystart; y < yend; ++y)
                                for (int x = xstart; x < xend; ++x)
                                {
                                    const int index = y * inp_width + x;
                                    float val = srcData[index];
                                    sum_val += val;
                                }

                            dstData[x0] = sum_val*inv_kernel_area;
                        }
                    }
                }
            }
        }
    };

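    // Thin wrappers that run PoolingInvoker with one stripe per available thread.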
    void maxPooling(Mat &src, Mat &dst, Mat &mask)
    {
        const int nstripes = getNumThreads();
        PoolingInvoker::run(src, dst, mask, kernel, stride, pad, type, computeMaxIdx, nstripes);
    }

    void avePooling(Mat &src, Mat &dst)
    {
        const int nstripes = getNumThreads();
        Mat mask;
        PoolingInvoker::run(src, dst, mask, kernel, stride, pad, type, computeMaxIdx, nstripes);
    }

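    // Halide MAX pooling: argmax over the kernel RDom yields the tuple
    // (r.x, r.y, max value); the first two components are converted back to a
    // flattened input index so the second output matches the CPU/OpenCL mask.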
    virtual Ptr<BackendNode> initMaxPoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
    {
#ifdef HAVE_HALIDE
        Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
        const int inWidth = inputBuffer.width();
        const int inHeight = inputBuffer.height();

        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        Halide::RDom r(0, kernel.width, 0, kernel.height);
        Halide::Expr kx, ky;
        if (pad.width || pad.height)
        {
            kx = clamp(x * stride.width + r.x - pad.width, 0, inWidth - 1);
            ky = clamp(y * stride.height + r.y - pad.height, 0, inHeight - 1);
        }
        else
        {
            kx = min(x * stride.width + r.x, inWidth - 1);
            ky = min(y * stride.height + r.y, inHeight - 1);
        }

        // Halide::argmax returns tuple (r.x, r.y, max).
        Halide::Tuple res = argmax(inputBuffer(kx, ky, c, n));

        // Compute offset from argmax in range [0, kernel_size).
        Halide::Expr max_index;
        if (pad.width || pad.height)
        {
            max_index = clamp(y * stride.height + res[1] - pad.height,
                              0, inHeight - 1) * inWidth +
                        clamp(x * stride.width + res[0] - pad.width,
                              0, inWidth - 1);
        }
        else
        {
            max_index = min(y * stride.height + res[1], inHeight - 1) * inWidth +
                        min(x * stride.width + res[0], inWidth - 1);
        }
        top(x, y, c, n) = { res[2], Halide::cast<float>(max_index) };
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }

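    // Halide AVE pooling: a plain sum over the kernel RDom scaled by
    // 1/(kernel area); windows that would only partially cover the input are
    // not supported and trigger StsNotImplemented.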
    virtual Ptr<BackendNode> initAvePoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
    {
#ifdef HAVE_HALIDE
        Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);

        const int inW = inputBuffer.width(), inH = inputBuffer.height();
        if ((inW - kernel.width) % stride.width || (inH - kernel.height) % stride.height)
        {
            CV_Error(cv::Error::StsNotImplemented,
                     "Halide backend for average pooling with partial "
                     "kernels is not implemented");
        }

        const float norm = 1.0f / (kernel.width * kernel.height);

        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        Halide::RDom r(0, kernel.width, 0, kernel.height);
        top(x, y, c, n) = sum(
            inputBuffer(x * stride.width + r.x,
                        y * stride.height + r.y, c, n)) * norm;
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }

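    // CPU schedule for the Halide pipeline: outputs are fused into tiles and
    // parallelized; large outputs are additionally split into 8x8 spatial
    // blocks (and the channel dimension is split when there are more than 8
    // channels) before vectorizing the innermost dimension.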
    virtual void applyHalideScheduler(Ptr<BackendNode>& node,
                                      const std::vector<Mat*> &inputs,
                                      const std::vector<Mat> &outputs,
                                      int targetId) const
    {
#ifdef  HAVE_HALIDE
        if (targetId != DNN_TARGET_CPU)
        {
            Layer::applyHalideScheduler(node, inputs, outputs, targetId);
            return;
        }
        Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"),
                    xi("xi"), yi("yi"), ci("ci"), xo("xo"), yo("yo"), co("co");
        Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();

        int outW, outH, outC, outN;
        getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);

        if (outW < 8 || outH < 8)
        {
            if (outC > 8)
                top.split(c, co, ci, 8)
                   .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
                   .parallel(tile)
                   .vectorize(ci);
            else
            {
                top.fuse(y, c, tile).fuse(n, tile, tile)
                   .parallel(tile);
                if (outW > 1)
                    top.vectorize(x);
            }
        }
        else
        {
            if (outC > 8)
                top.split(x, xo, xi, 8).split(y, yo, yi, 8).split(c, co, ci, 8)
                   .fuse(xo, yo, tile).fuse(co, tile, tile).fuse(n, tile, tile)
                   .parallel(tile)
                   .vectorize(xi);
            else
                top.split(x, xo, xi, 8).split(y, yo, yi, 8)
                   .fuse(xo, yo, tile).fuse(c, tile, tile).fuse(n, tile, tile)
                   .parallel(tile)
                   .vectorize(xi);
        }
#endif  // HAVE_HALIDE
    }

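    // Output spatial size follows the Caffe-style formula
    //   out = 1 + (in + 2*pad - kernel) / stride,
    // rounded up when ceil_mode is set (the default) and down otherwise.
    // For example, with in=7, kernel=2, stride=2, pad=0 this gives
    // 1 + ceil(2.5) = 4 with ceil_mode and 1 + floor(2.5) = 3 without.
    // With padding the last window is clipped so that pooling always starts
    // inside the padded image. For MAX pooling two shapes are reported per
    // input (pooled values and the index mask).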
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const
    {
        CV_Assert(inputs.size() != 0);
        Size in(inputs[0][3], inputs[0][2]), out;

        if (globalPooling)
        {
            out.height = 1;
            out.width = 1;
        }
        else if (padMode.empty())
        {
            float height = (float)(in.height + 2 * pad.height - kernel.height) / stride.height;
            float width = (float)(in.width + 2 * pad.width - kernel.width) / stride.width;
            out.height = 1 + (ceilMode ? ceil(height) : floor(height));
            out.width = 1 + (ceilMode ? ceil(width) : floor(width));

            if (pad.height || pad.width)
            {
                // If we have padding, ensure that the last pooling starts strictly
                // inside the image (instead of at the padding); otherwise clip the last.
                if ((out.height - 1) * stride.height >= in.height + pad.height)
                    --out.height;
                if ((out.width - 1) * stride.width >= in.width + pad.width)
                    --out.width;
                CV_Assert((out.height - 1) * stride.height < in.height + pad.height);
                CV_Assert((out.width - 1) * stride.width < in.width + pad.width);
            }
        }
        else
        {
            getConvPoolOutParams(in, kernel, stride, padMode, Size(1, 1), out);
        }

        outputs.resize(type == MAX ? 2 * inputs.size() : inputs.size());
        for (size_t i = 0; i < inputs.size(); i++)
        {
            size_t index = type == MAX ? 2*i : i;
            int dims[] = {inputs[i][0], inputs[i][1], out.height, out.width};
            outputs[index] = shape(dims);

            if (type == MAX)
                outputs[index + 1] = shape(dims);
        }

        return false;
    }

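    // Rough complexity estimate: one comparison per kernel tap for each MAX
    // output (index masks at odd positions are skipped), and kernel.area() + 1
    // operations (sum plus the final scale) per AVE output element.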
    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const
    {
        (void)inputs; // suppress unused variable warning
        long flops = 0;

        for(int i = 0; i < outputs.size(); i++)
        {
            if (type == MAX)
            {
                if (i%2 == 0)
                    flops += total(outputs[i])*kernel.area();
            }
            else
            {
                flops += total(outputs[i])*(kernel.area() + 1);
            }
        }
        return flops;
    }
};

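// Factory entry point: builds a PoolingLayerImpl from the parsed LayerParams
// (pooling type, kernel, stride, padding, ceil_mode).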
Ptr<PoolingLayer> PoolingLayer::create(const LayerParams& params)
{
    return Ptr<PoolingLayer>(new PoolingLayerImpl(params));
}

}
}