modules/dnn/src/layers/pooling_layer.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
  14 // Copyright (C) 2017, Intel Corporation, all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //   * Redistribution's of source code must retain the above copyright notice,
  21 //     this list of conditions and the following disclaimer.
  22 //
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,
  24 //     this list of conditions and the following disclaimer in the documentation
  25 //     and/or other materials provided with the distribution.
  26 //
  27 //   * The name of the copyright holders may not be used to endorse or promote products
  28 //     derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or implied warranties, including, but not limited to, the implied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //
  41 //M*/
  42
  43 #include "../precomp.hpp"
  44 #include "layers_common.hpp"
  45 #include "opencv2/core/hal/intrin.hpp"
  46 #include "../op_cuda.hpp"
  47 #include "../op_halide.hpp"
  48 #include "../op_inf_engine.hpp"
  49 #include "../op_vkcom.hpp"
  50 #include <float.h>
  51 #include <algorithm>
  52 #include <numeric>
  53 using std::max;
  54 using std::min;
  55
  56 #ifdef HAVE_OPENCL
  57 #include "opencl_kernels_dnn.hpp"
  58 using namespace cv::dnn::ocl4dnn;
  59 #endif
  60
  61 #ifdef HAVE_CUDA
  62 #include "../cuda4dnn/primitives/pooling.hpp"
  63 #include "../cuda4dnn/primitives/max_unpooling.hpp"
  64 using namespace cv::dnn::cuda4dnn;
  65 #endif
  66
  67 namespace cv
  68 {
  69 namespace dnn
  70 {
  71 static inline int roundRoiSize(float v)
  72 {
  73     return (int)(v + (v >= 0.f ? 0.5f : -0.5f));
  74 }
  75
  76 class PoolingLayerImpl CV_FINAL : public PoolingLayer
  77 {
  78 public:
  79     PoolingLayerImpl(const LayerParams& params)
  80     {
  81         computeMaxIdx = true;
  82         globalPooling = false;
  83         stride = Size(1, 1);
  84         pad_t = pad_l = pad_b = pad_r = 0;
  85
  86         if (params.has("pool") || params.has("kernel_size") ||
  87             params.has("kernel_w") || params.has("kernel_h"))
  88         {
  89             String pool = toLowerCase(params.get<String>("pool", "max"));
  90             if (pool == "max")
  91                 type = MAX;
  92             else if (pool == "ave")
  93                 type = AVE;
  94             else if (pool == "stochastic")
  95                 type = STOCHASTIC;
  96             else
  97                 CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\"");
  98
  99             getPoolingKernelParams(params, kernel_size, globalPooling, pads_begin, pads_end, strides, padMode);
 100             if (kernel_size.size() == 2) {
 101                 kernel = Size(kernel_size[1], kernel_size[0]);
 102                 stride = Size(strides[1], strides[0]);
 103                 pad = Size(pads_begin[1], pads_begin[0]);
 104
 105                 pad_t = pads_begin[0];
 106                 pad_l = pads_begin[1];
 107                 pad_b = pads_end[0];
 108                 pad_r = pads_end[1];
 109             }
 110         }
 111         else if (params.has("pooled_w") || params.has("pooled_h"))
 112         {
 113             type = ROI;
 114             pooledSize.width = params.get<uint32_t>("pooled_w", 1);
 115             pooledSize.height = params.get<uint32_t>("pooled_h", 1);
 116         }
 117         else if (params.has("output_dim") && params.has("group_size"))
 118         {
 119             type = PSROI;
 120             pooledSize.width = params.get<int>("group_size");
 121             pooledSize.height = pooledSize.width;
 122             psRoiOutChannels = params.get<int>("output_dim");
 123         }
 124         else
 125             CV_Error(Error::StsBadArg, "Cannot determine pooling type");
 126         setParamsFrom(params);
 127         ceilMode = params.get<bool>("ceil_mode", true);
 128         spatialScale = params.get<float>("spatial_scale", 1);
 129         avePoolPaddedArea = params.get<bool>("ave_pool_padded_area", true);
 130     }
 131
#ifdef HAVE_OPENCL
    // Cached OpenCL pooling primitive. forward_ocl() creates it on first use
    // (when it is empty) and finalize() releases it.
    Ptr<OCL4DNNPool<float> > poolOp;
#endif
 135
 136     void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
 137     {
 138         std::vector<Mat> inputs, outputs;
 139         inputs_arr.getMatVector(inputs);
 140         outputs_arr.getMatVector(outputs);
 141
 142         CV_Assert(!inputs.empty());
 143
 144         std::vector<int> inp;
 145         std::vector<int> out;
 146         for (int i = 2; i < inputs[0].dims; i++) {
 147             inp.push_back(inputs[0].size[i]);
 148             out.push_back(outputs[0].size[i]);
 149         }
 150         if (globalPooling) {
 151             kernel = Size(inp[1], inp[0]);
 152             kernel_size = std::vector<size_t>(inp.begin(), inp.end());
 153         }
 154
 155         getConvPoolPaddings(inp, kernel_size, strides, padMode, pads_begin, pads_end);
 156         if (pads_begin.size() == 2) {
 157             pad_t = pads_begin[0];
 158             pad_l = pads_begin[1];
 159             pad_b = pads_end[0];
 160             pad_r = pads_end[1];
 161         }
 162
 163 #ifdef HAVE_OPENCL
 164         poolOp.release();
 165 #endif
 166         computeMaxIdx = type == MAX && outputs.size() == 2;
 167     }
 168
 169     virtual bool supportBackend(int backendId) CV_OVERRIDE
 170     {
 171         if (backendId == DNN_BACKEND_CUDA)
 172         {
 173             return type == MAX || type == AVE;
 174         }
 175         else if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
 176         {
 177             if (computeMaxIdx)
 178                 return false;
 179 #ifdef HAVE_INF_ENGINE
 180             if (kernel_size.size() == 3)
 181                 return preferableTarget == DNN_TARGET_CPU;
 182             if (preferableTarget == DNN_TARGET_MYRIAD) {
 183 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
 184                 if (type == MAX && (pad_l == 1 && pad_t == 1) && stride == Size(2, 2) ) {
 185                     return !isMyriadX();
 186                 }
 187 #endif
 188                 return type == MAX || type == AVE;
 189             }
 190             else
 191                 return type != STOCHASTIC;
 192 #else
 193             return false;
 194 #endif
 195         }
 196         else
 197         {
 198             if (kernel_size.size() == 3)
 199                 return (backendId == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_CPU);
 200             if (kernel_size.empty() || kernel_size.size() == 2)
 201                 return backendId == DNN_BACKEND_OPENCV ||
 202                        (backendId == DNN_BACKEND_HALIDE && haveHalide() &&
 203                            (type == MAX || (type == AVE && !pad_t && !pad_l && !pad_b && !pad_r))) ||
 204                        (backendId == DNN_BACKEND_VKCOM && haveVulkan() &&
 205                            (type == MAX || type == AVE));
 206             else
 207                 return false;
 208         }
 209     }
 210
 211 #ifdef HAVE_OPENCL
 212     bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, InputArrayOfArrays internals)
 213     {
 214         std::vector<UMat> inputs;
 215         std::vector<UMat> outputs;
 216
 217         bool use_half = (inps.depth() == CV_16S);
 218         inps.getUMatVector(inputs);
 219         outs.getUMatVector(outputs);
 220
 221         if (poolOp.empty())
 222         {
 223             OCL4DNNPoolConfig config;
 224
 225             config.in_shape = shape(inputs[0]);
 226             config.out_shape = shape(outputs[0]);
 227             config.kernel = kernel;
 228             config.pad_l = pad_l;
 229             config.pad_t = pad_t;
 230             config.pad_r = pad_r;
 231             config.pad_b = pad_b;
 232             config.stride = stride;
 233             config.channels = inputs[0].size[1];
 234             config.pool_method = type == MAX ? LIBDNN_POOLING_METHOD_MAX :
 235                                 (type == AVE ? LIBDNN_POOLING_METHOD_AVE :
 236                                                LIBDNN_POOLING_METHOD_STO);
 237             config.avePoolPaddedArea = avePoolPaddedArea;
 238             config.computeMaxIdx = computeMaxIdx;
 239             config.use_half = use_half;
 240             poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config));
 241         }
 242
 243         CV_Assert_N(inputs.size() == 1, !outputs.empty(), !computeMaxIdx || outputs.size() == 2);
 244         UMat& inpMat = inputs[0];
 245         UMat& outMat = outputs[0];
 246         UMat maskMat = computeMaxIdx ? outputs[1] : UMat();
 247
 248         CV_Assert(inpMat.offset == 0 && outMat.offset == 0);
 249
 250         return poolOp->Forward(inpMat, outMat, maskMat);
 251     }
 252 #endif
 253
 254     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
 255     {
 256         CV_TRACE_FUNCTION();
 257         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 258
 259         if (type == MAX || type == AVE || type == STOCHASTIC)
 260         {
 261             CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
 262                        forward_ocl(inputs_arr, outputs_arr, internals_arr))
 263         }
 264         if (inputs_arr.depth() == CV_16S)
 265         {
 266             forward_fallback(inputs_arr, outputs_arr, internals_arr);
 267             return;
 268         }
 269
 270         std::vector<Mat> inputs, outputs;
 271         inputs_arr.getMatVector(inputs);
 272         outputs_arr.getMatVector(outputs);
 273
 274         switch (type)
 275         {
 276             case MAX:
 277             {
 278                 CV_Assert_N(inputs.size() == 1, !computeMaxIdx || outputs.size() == 2);
 279                 Mat mask = computeMaxIdx ? outputs[1] : Mat();
 280                 maxPooling(inputs[0], outputs[0], mask);
 281                 break;
 282             }
 283             case AVE:
 284                 CV_Assert_N(inputs.size() == 1, outputs.size() == 1);
 285                 avePooling(inputs[0], outputs[0]);
 286                 break;
 287             case ROI: case PSROI:
 288                 CV_Assert_N(inputs.size() == 2, outputs.size() == 1);
 289                 roiPooling(inputs[0], inputs[1], outputs[0]);
 290                 break;
 291             default:
 292                 CV_Error(Error::StsNotImplemented, "Not implemented");
 293                 break;
 294         }
 295     }
 296
 297 #ifdef HAVE_CUDA
 298     Ptr<BackendNode> initCUDA(
 299         void *context_,
 300         const std::vector<Ptr<BackendWrapper>>& inputs,
 301         const std::vector<Ptr<BackendWrapper>>& outputs
 302     ) override
 303     {
 304         auto context = reinterpret_cast<csl::CSLContext*>(context_);
 305
 306         auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
 307         auto input_shape = input_wrapper->getShape();
 308
 309         /* storing max indices is a special case and we deal with it separately */
 310         if (computeMaxIdx) {
 311             CV_Assert(type == MAX);
 312
 313             cuda4dnn::MaxPoolingConfiguration config;
 314             config.window_size.assign(std::begin(kernel_size), std::end(kernel_size));
 315             config.strides.assign(std::begin(strides), std::end(strides));
 316
 317             if (padMode.empty())
 318             {
 319                 config.padMode = MaxPoolingConfiguration::PaddingMode::MANUAL;
 320                 config.pads_begin.assign(std::begin(pads_begin), std::end(pads_begin));
 321             }
 322             else if (padMode == "VALID")
 323             {
 324                 config.padMode = MaxPoolingConfiguration::PaddingMode::VALID;
 325             }
 326             else if (padMode == "SAME")
 327             {
 328                 config.padMode = MaxPoolingConfiguration::PaddingMode::SAME;
 329             }
 330             else
 331             {
 332                 CV_Error(Error::StsNotImplemented, padMode + " padding mode not supported by PoolingLayer");
 333             }
 334
 335             config.input_shape.assign(std::begin(input_shape), std::end(input_shape));
 336
 337             return make_cuda_node<cuda4dnn::MaxPoolingOp>(preferableTarget, std::move(context->stream), config);
 338         }
 339
 340         PoolingConfiguration config;
 341         if (type == MAX)
 342         {
 343             config.poolMode = PoolingConfiguration::PoolingMode::MAX;
 344         }
 345         else if (type == AVE && !avePoolPaddedArea)
 346         {
 347             config.poolMode = PoolingConfiguration::PoolingMode::AVERAGE_EXCLUDE_PADDING;
 348         }
 349         else if (type == AVE && avePoolPaddedArea)
 350         {
 351             config.poolMode = PoolingConfiguration::PoolingMode::AVERAGE_INCLUDE_PADDING;
 352         }
 353         else
 354         {
 355             CV_Error(Error::StsNotImplemented, "Unsupported pooling mode");
 356         }
 357
 358         config.window_size.assign(std::begin(kernel_size), std::end(kernel_size));
 359         config.strides.assign(std::begin(strides), std::end(strides));
 360
 361         if (padMode.empty())
 362         {
 363             config.padMode = PoolingConfiguration::PaddingMode::MANUAL;
 364             config.pads_begin.assign(std::begin(pads_begin), std::end(pads_begin));
 365             config.pads_end.assign(std::begin(pads_end), std::end(pads_end));
 366         }
 367         else if (padMode == "VALID")
 368         {
 369             config.padMode = PoolingConfiguration::PaddingMode::VALID;
 370         }
 371         else if (padMode == "SAME")
 372         {
 373             config.padMode = PoolingConfiguration::PaddingMode::SAME;
 374         }
 375         else
 376         {
 377             CV_Error(Error::StsNotImplemented, padMode + " padding mode not supported by PoolingLayer");
 378         }
 379
 380         if (ceilMode)
 381             config.roundMode = PoolingConfiguration::RoundingMode::CEIL;
 382         else
 383             config.roundMode = PoolingConfiguration::RoundingMode::FLOOR;
 384
 385         config.input_shape.assign(std::begin(input_shape), std::end(input_shape));
 386
 387         return make_cuda_node<cuda4dnn::PoolingOp>(preferableTarget, std::move(context->cudnn_handle), config);
 388     }
 389 #endif
 390
 391     virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
 392     {
 393 #ifdef HAVE_VULKAN
 394         int padding_mode;
 395         vkcom::PoolType pool_type;
 396         int filter_size[2] = {kernel.height, kernel.width};
 397         int pad_size[2] = {pad.height, pad.width};
 398         int stride_size[2] = {stride.height, stride.width};
 399         pool_type = type == MAX ? vkcom::kPoolTypeMax:
 400                    (type == AVE ? vkcom::kPoolTypeAvg:
 401                             vkcom::kPoolTypeNum);
 402
 403         if (padMode.empty())
 404         {
 405             padding_mode = vkcom::kPaddingModeCaffe;
 406         }
 407         else if (padMode == "VALID")
 408         {
 409             padding_mode = vkcom::kPaddingModeValid;
 410         }
 411         else if (padMode == "SAME")
 412         {
 413             padding_mode = vkcom::kPaddingModeSame;
 414         }
 415         else
 416             CV_Error(Error::StsError, "Unsupported padding mode " + padMode);
 417
 418         std::shared_ptr<vkcom::OpBase> op(new vkcom::OpPool(filter_size, pad_size,
 419                                                             stride_size, padding_mode,
 420                                                             pool_type, avePoolPaddedArea));
 421         return Ptr<BackendNode>(new VkComBackendNode(inputs, op));
 422 #endif
 423         return Ptr<BackendNode>();
 424     }
 425
 426     virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
 427     {
 428         if (type == MAX)
 429             return initMaxPoolingHalide(inputs);
 430         else if (type == AVE)
 431             return initAvePoolingHalide(inputs);
 432         else
 433             return Ptr<BackendNode>();
 434     }
 435
 436 #ifdef HAVE_INF_ENGINE
 437     virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
 438     {
 439         if (type == MAX || type == AVE)
 440         {
 441             InferenceEngine::Builder::PoolingLayer ieLayer(name);
 442
 443             ieLayer.setKernel(kernel_size);
 444             ieLayer.setStrides(strides);
 445             ieLayer.setPaddingsBegin(pads_begin);
 446             ieLayer.setPaddingsEnd(pads_end);
 447
 448             ieLayer.setPoolingType(type == MAX ?
 449                                    InferenceEngine::Builder::PoolingLayer::PoolingType::MAX :
 450                                    InferenceEngine::Builder::PoolingLayer::PoolingType::AVG);
 451             ieLayer.setRoundingType(ceilMode ?
 452                                     InferenceEngine::Builder::PoolingLayer::RoundingType::CEIL :
 453                                     InferenceEngine::Builder::PoolingLayer::RoundingType::FLOOR);
 454             ieLayer.setExcludePad(type == AVE && padMode == "SAME");
 455
 456             InferenceEngine::Builder::Layer l = ieLayer;
 457             if (!padMode.empty())
 458                 l.getParameters()["auto_pad"] = padMode == "VALID" ? std::string("valid") : std::string("same_upper");
 459             return Ptr<BackendNode>(new InfEngineBackendNode(l));
 460         }
 461         else if (type == ROI)
 462         {
 463             InferenceEngine::Builder::ROIPoolingLayer ieLayer(name);
 464             ieLayer.setSpatialScale(spatialScale);
 465             ieLayer.setPooled({pooledSize.height, pooledSize.width});
 466             ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(2));
 467             return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
 468         }
 469         else if (type == PSROI)
 470         {
 471             InferenceEngine::Builder::PSROIPoolingLayer ieLayer(name);
 472             ieLayer.setSpatialScale(spatialScale);
 473             ieLayer.setOutputDim(psRoiOutChannels);
 474             ieLayer.setGroupSize(pooledSize.width);
 475             ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(2));
 476             return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
 477         }
 478         else
 479             CV_Error(Error::StsNotImplemented, "Unsupported pooling type");
 480         return Ptr<BackendNode>();
 481     }
 482 #endif  // HAVE_INF_ENGINE
 483
 484
 485     class PoolingInvoker : public ParallelLoopBody
 486     {
 487     public:
        // Input blob and ROI blob; both pointers are filled in by run().
        const Mat* src, *rois;
        // Output blob and max-index blob (mask may point to an empty Mat).
        Mat *dst, *mask;
        // 2D kernel and stride, built by run() from the first two entries of
        // kernel_size / strides as Size(entry[1], entry[0]).
        Size kernel, stride;
        // Paddings taken by run() from the last two entries of pads_begin
        // (pad_t, pad_l) and pads_end (pad_b, pad_r).
        int pad_l, pad_t, pad_r, pad_b;
        bool avePoolPaddedArea;
        // Number of stripes the output elements are split into for parallel_for_.
        int nstripes;
        bool computeMaxIdx;
        // Source offsets of every kernel element relative to the window origin;
        // filled by run() only when computeMaxIdx is false.
        std::vector<int> ofsbuf;
        // Pooling kind; compared against MAX, ROI and PSROI in operator().
        int poolingType;
        // Factor applied to ROI coordinates in operator().
        float spatialScale;

        // Full N-d pooling geometry as passed to run().
        std::vector<size_t> pads_begin, pads_end;
        std::vector<size_t> kernel_size;
        std::vector<size_t> strides;
 502
 503         PoolingInvoker() : src(0), rois(0), dst(0), mask(0), pad_l(0), pad_t(0), pad_r(0), pad_b(0),
 504                            avePoolPaddedArea(false), nstripes(0),
 505                            computeMaxIdx(0), poolingType(MAX), spatialScale(0) {}
 506
 507         static void run(const Mat& src, const Mat& rois, Mat& dst, Mat& mask,
 508                         std::vector<size_t> kernel_size, std::vector<size_t> strides,
 509                         std::vector<size_t> pads_begin, std::vector<size_t> pads_end,
 510                         bool avePoolPaddedArea, int poolingType, float spatialScale,
 511                         bool computeMaxIdx, int nstripes)
 512         {
 513             CV_Assert_N(
 514                       src.isContinuous(), dst.isContinuous(),
 515                       src.type() == CV_32F, src.type() == dst.type(),
 516                       src.dims == 4 || src.dims == 5, dst.dims == 4 || dst.dims == 5,
 517                       (((poolingType == ROI || poolingType == PSROI) &&
 518                       dst.size[0] == rois.size[0]) || src.size[0] == dst.size[0]),
 519                       poolingType == PSROI || src.size[1] == dst.size[1],
 520                       (mask.empty() || (mask.type() == src.type() && mask.size == dst.size)));
 521
 522             PoolingInvoker p;
 523
 524             p.src = &src;
 525             p.rois = &rois;
 526             p.dst = &dst;
 527
 528             p.kernel_size = kernel_size;
 529             p.strides = strides;
 530             p.pads_begin = pads_begin;
 531             p.pads_end = pads_end;
 532
 533             p.mask = &mask;
 534             p.kernel = Size(kernel_size[1], kernel_size[0]);
 535             p.stride = Size(strides[1], strides[0]);
 536             p.pad_l = pads_begin.back();
 537             p.pad_t = pads_begin[pads_begin.size() - 2];
 538             p.pad_r = pads_end.back();
 539             p.pad_b = pads_end[pads_end.size() - 2];
 540
 541             p.avePoolPaddedArea = avePoolPaddedArea;
 542             p.nstripes = nstripes;
 543             p.computeMaxIdx = computeMaxIdx;
 544             p.poolingType = poolingType;
 545             p.spatialScale = spatialScale;
 546
 547             if( !computeMaxIdx )
 548             {
 549                 int height = src.size[src.dims - 2];
 550                 int width = src.size[src.dims - 1];
 551
 552                 int kernel_d = (kernel_size.size() == 3) ? kernel_size[0] : 1;
 553                 int kernel_h = kernel_size[kernel_size.size() - 2];
 554                 int kernel_w = kernel_size.back();
 555
 556                 p.ofsbuf.resize(kernel_d * kernel_h * kernel_w);
 557                 for (int i = 0; i < kernel_d; ++i) {
 558                     for (int j = 0; j < kernel_h; ++j) {
 559                         for (int k = 0; k < kernel_w; ++k) {
 560                             p.ofsbuf[i * kernel_h * kernel_w + j * kernel_w + k] = width * height * i + width * j + k;
 561                         }
 562                     }
 563                 }
 564             }
 565
 566             parallel_for_(Range(0, nstripes), p, nstripes);
 567         }
 568
 569         void operator()(const Range& r) const CV_OVERRIDE
 570         {
 571             int channels = dst->size[1];
 572
 573             bool isPool2D = src->dims == 4;
 574             int depth = !isPool2D? dst->size[2] : 1;
 575             int height = dst->size[dst->dims - 2];
 576             int width = dst->size[dst->dims - 1];
 577
 578             int inp_depth = !isPool2D? src->size[2] : 1;
 579             int inp_height = src->size[src->dims - 2];
 580             int inp_width = src->size[src->dims - 1];
 581
 582             size_t total = dst->total();
 583             size_t stripeSize = (total + nstripes - 1)/nstripes;
 584             size_t stripeStart = r.start*stripeSize;
 585             size_t stripeEnd = std::min(r.end*stripeSize, total);
 586
 587             int kernel_d = !isPool2D? kernel_size[0] : 1;
 588             int kernel_h = kernel_size[kernel_size.size() - 2];
 589             int kernel_w = kernel_size.back();
 590
 591             int stride_d = !isPool2D? strides[0] : 0;
 592             int stride_h = strides[strides.size() - 2];
 593             int stride_w = strides.back();
 594             bool compMaxIdx = computeMaxIdx;
 595
 596 #if CV_SIMD128
 597             const int* ofsptr = ofsbuf.empty() ? 0 : (const int*)&ofsbuf[0];
 598             if (poolingType == MAX && !compMaxIdx && !ofsptr)
 599                 CV_Error(Error::StsBadArg, "ofsbuf should be initialized in this mode");
 600             v_float32x4 idx00(0.f, (float)stride_w, (float)(stride_w*2), (float)(stride_w*3));
 601             v_float32x4 ones = v_setall_f32(1.f);
 602             v_float32x4 idx_delta = v_setall_f32((float)(inp_width - kernel_w));
 603 #endif
 604
 605             for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
 606             {
 607                 size_t ofs = ofs0;
 608                 int x0 = (int)(ofs % width);
 609                 ofs /= width;
 610                 int y0 = (int)(ofs % height);
 611                 ofs /= height;
 612
 613                 int d0 = (int)(ofs % depth);
 614                 ofs /= depth;
 615
 616                 int c = (int)(ofs % channels);
 617                 int n = (int)(ofs / channels);
 618                 int ystart, yend;
 619                 int dstart = 0, dend = 1;
 620
 621                 const float *srcData = 0;
 622                 if (poolingType == ROI)
 623                 {
 624                     const float *roisData = rois->ptr<float>(n);
 625                     int ystartROI = roundRoiSize(roisData[2] * spatialScale);
 626                     int yendROI = roundRoiSize(roisData[4] * spatialScale);
 627                     int roiHeight = std::max(yendROI - ystartROI + 1, 1);
 628                     float roiRatio = (float)roiHeight / height;
 629
 630                     ystart = ystartROI + y0 * roiRatio;
 631                     yend = ystartROI + std::ceil((y0 + 1) * roiRatio);
 632
 633                     CV_Assert(roisData[0] < src->size[0]);
 634                     srcData = src->ptr<float>(roisData[0], c);
 635                 }
 636                 else if (poolingType == PSROI)
 637                 {
 638                     const float *roisData = rois->ptr<float>(n);
 639                     float ystartROI = roundRoiSize(roisData[2]) * spatialScale;
 640                     float yendROI = roundRoiSize(roisData[4] + 1) * spatialScale;
 641                     float roiHeight = std::max(yendROI - ystartROI, 0.1f);
 642                     float roiRatio = roiHeight / height;
 643
 644                     ystart = (int)std::floor(ystartROI + y0 * roiRatio);
 645                     yend = (int)std::ceil(ystartROI + (y0 + 1) * roiRatio);
 646                 }
 647                 else
 648                 {
 649                     int pad_d_begin = (pads_begin.size() == 3) ? pads_begin[0] : 0;
 650                     dstart = d0 * stride_d - pad_d_begin;
 651                     dend = min(dstart + kernel_d, (int)(inp_depth + pads_end[0]));
 652
 653                     ystart = y0 * stride_h - pad_t;
 654                     yend = min(ystart + kernel_h, inp_height + pad_b);
 655                     srcData = src->ptr<float>(n, c);
 656                 }
 657                 int ddelta = dend - dstart;
 658                 dstart = max(dstart, 0);
 659                 dend = min(dend, inp_depth);
 660                 int ydelta = yend - ystart;
 661                 ystart = max(ystart, 0);
 662                 yend = min(yend, inp_height);
 663                 float *dstData = &dst->ptr<float>(n, c, d0)[y0 * width];
 664                 float *dstMaskData = mask->data ? &mask->ptr<float>(n, c, d0)[y0 * width] : 0;
 665
 666                 int delta = std::min((int)(stripeEnd - ofs0), width - x0);
 667                 ofs0 += delta;
 668                 int x1 = x0 + delta;
 669
 670                 if( poolingType == MAX)
 671                     for( ; x0 < x1; x0++ )
 672                     {
 673                         int xstart = x0 * stride_w - pad_l;
 674                         int xend = min(xstart + kernel_w, inp_width);
 675                         xstart = max(xstart, 0);
 676                         if (xstart >= xend || ystart >= yend)
 677                         {
 678                             dstData[x0] = 0;
 679                             if (compMaxIdx && dstMaskData)
 680                                 dstMaskData[x0] = -1;
 681                             continue;
 682                         }
 683 #if CV_SIMD128
 684                         if( isPool2D && xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_l + kernel_w < inp_width )
 685                         {
 686                             if( compMaxIdx )
 687                             {
 688                                 v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
 689                                 v_float32x4 max_val1 = max_val0;
 690                                 v_float32x4 max_idx0 = v_setall_f32(-1.f);
 691                                 v_float32x4 max_idx1 = max_idx0;
 692                                 int index0 = ystart * inp_width + xstart;
 693                                 v_float32x4 idx0 = idx00 + v_setall_f32((float)index0);
 694                                 v_float32x4 idx1 = idx0 + v_setall_f32((float)(stride_w*4));
 695
 696                                 for (int y = ystart; y < yend; ++y)
 697                                 {
 698                                     for (int x = xstart; x < xend; ++x, idx0 += ones, idx1 += ones)
 699                                     {
 700                                         const int index = y * inp_width + x;
 701                                         v_float32x4 v0(srcData[index], srcData[index + stride_w],
 702                                                        srcData[index + stride_w*2], srcData[index + stride_w*3]);
 703                                         v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
 704                                                        srcData[index + stride_w*6], srcData[index + stride_w*7]);
 705                                         max_idx0 = v_select(v0 > max_val0, idx0, max_idx0);
 706                                         max_idx1 = v_select(v1 > max_val1, idx1, max_idx1);
 707                                         max_val0 = v_max(max_val0, v0);
 708                                         max_val1 = v_max(max_val1, v1);
 709                                     }
 710                                     idx0 += idx_delta;
 711                                     idx1 += idx_delta;
 712                                 }
 713                                 v_store(dstData + x0, max_val0);
 714                                 v_store(dstData + x0 + 4, max_val1);
 715                                 if (dstMaskData)
 716                                 {
 717                                     v_store(dstMaskData + x0, max_idx0);
 718                                     v_store(dstMaskData + x0 + 4, max_idx1);
 719                                 }
 720                                 x0 += 7;
 721                             }
 722                             else
 723                             {
 724                                 v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
 725                                 v_float32x4 max_val1 = max_val0;
 726
 727                                 if( yend - ystart == kernel_h )
 728                                 {
 729                                     const float* srcData1 = srcData + ystart*inp_width + xstart;
 730                                     if( stride_w == 1 )
 731                                         for (int k = 0; k < kernel_w*kernel_h; k++)
 732                                         {
 733                                             int index = ofsptr[k];
 734                                             v_float32x4 v0 = v_load(srcData1 + index);
 735                                             v_float32x4 v1 = v_load(srcData1 + index + 4);
 736                                             max_val0 = v_max(max_val0, v0);
 737                                             max_val1 = v_max(max_val1, v1);
 738                                         }
 739                                     else if( stride_w == 2 )
 740                                         for (int k = 0; k < kernel_w*kernel_h; k++)
 741                                         {
 742                                             int index = ofsptr[k];
 743                                             v_float32x4 v0, v1, dummy;
 744                                             v_load_deinterleave(srcData1 + index, v0, dummy);     // f0  f2  f4  f6  ,f1  f3  f5  f7
 745                                             v_load_deinterleave(srcData1 + index + 8, v1, dummy); // f8  f10 f12 f14 ,f9  f11 f13 f15
 746                                             max_val0 = v_max(max_val0, v0);
 747                                             max_val1 = v_max(max_val1, v1);
 748                                         }
 749                                     else
 750                                         for (int k = 0; k < kernel_w*kernel_h; k++)
 751                                         {
 752                                             int index = ofsptr[k];
 753                                             v_float32x4 v0(srcData1[index], srcData1[index + stride_w],
 754                                                            srcData1[index + stride_w*2], srcData1[index + stride_w*3]);
 755                                             v_float32x4 v1(srcData1[index + stride_w*4], srcData1[index + stride_w*5],
 756                                                            srcData1[index + stride_w*6], srcData1[index + stride_w*7]);
 757                                             max_val0 = v_max(max_val0, v0);
 758                                             max_val1 = v_max(max_val1, v1);
 759                                         }
 760                                 }
 761                                 else
 762                                 {
 763                                     for (int y = ystart; y < yend; ++y)
 764                                     {
 765                                         for (int x = xstart; x < xend; ++x)
 766                                         {
 767                                             const int index = y * inp_width + x;
 768                                             v_float32x4 v0(srcData[index], srcData[index + stride_w],
 769                                                            srcData[index + stride_w*2], srcData[index + stride_w*3]);
 770                                             v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
 771                                                            srcData[index + stride_w*6], srcData[index + stride_w*7]);
 772                                             max_val0 = v_max(max_val0, v0);
 773                                             max_val1 = v_max(max_val1, v1);
 774                                         }
 775                                     }
 776                                 }
 777                                 v_store(dstData + x0, max_val0);
 778                                 v_store(dstData + x0 + 4, max_val1);
 779                                 x0 += 7;
 780                             }
 781                         }
 782                         else
 783 #endif
 784                         {
 785                             float max_val = -FLT_MAX;
 786                             if( compMaxIdx )
 787                             {
 788                                 int max_index = -1;
 789                                 for (int d = dstart; d < dend; ++d)
 790                                     for (int y = ystart; y < yend; ++y)
 791                                         for (int x = xstart; x < xend; ++x)
 792                                         {
 793                                             const int index = d * inp_width * inp_height + y * inp_width + x;
 794                                             float val = srcData[index];
 795                                             if (val > max_val)
 796                                             {
 797                                                 max_val = val;
 798                                                 max_index = index;
 799                                             }
 800                                         }
 801                                 dstData[x0] = max_val;
 802                                 if (dstMaskData)
 803                                     dstMaskData[x0] = max_index;
 804                             }
 805                             else
 806                             {
 807                                 for (int d = dstart; d < dend; ++d) {
 808                                     for (int y = ystart; y < yend; ++y) {
 809                                         for (int x = xstart; x < xend; ++x) {
 810                                             const int index = d * inp_width * inp_height + y * inp_width + x;
 811                                             float val = srcData[index];
 812                                             max_val = std::max(max_val, val);
 813                                         }
 814                                     }
 815                                 }
 816                                 dstData[x0] = max_val;
 817                             }
 818                         }
 819                     }
 820                 else if (poolingType == AVE)
 821                 {
 822                     for( ; x0 < x1; ++x0)
 823                     {
 824                         int xstart = x0 * stride_w - pad_l;
 825                         int xend = min(xstart + kernel_w, inp_width + pad_r);
 826                         int xdelta = xend - xstart;
 827                         xstart = max(xstart, 0);
 828                         xend = min(xend, inp_width);
 829                         float inv_kernel_area = avePoolPaddedArea ? xdelta * ydelta * ddelta :
 830                                                 ((dend - dstart) * (yend - ystart) * (xend - xstart));
 831                         inv_kernel_area = 1.0 / inv_kernel_area;
 832 #if CV_SIMD128
 833                         if( isPool2D && xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_l + kernel_w < inp_width )
 834                         {
 835                             v_float32x4 sum_val0 = v_setzero_f32(), sum_val1 = v_setzero_f32();
 836                             v_float32x4 ikarea = v_setall_f32(inv_kernel_area);
 837
 838                             for (int y = ystart; y < yend; ++y)
 839                             {
 840                                 for (int x = xstart; x < xend; ++x)
 841                                 {
 842                                     const int index = y * inp_width + x;
 843                                     v_float32x4 v0(srcData[index], srcData[index + stride_w],
 844                                                    srcData[index + stride_w*2], srcData[index + stride_w*3]);
 845                                     v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
 846                                                    srcData[index + stride_w*6], srcData[index + stride_w*7]);
 847                                     sum_val0 += v0;
 848                                     sum_val1 += v1;
 849                                 }
 850                             }
 851                             v_store(dstData + x0, sum_val0*ikarea);
 852                             v_store(dstData + x0 + 4, sum_val1*ikarea);
 853                             x0 += 7;
 854                         }
 855                         else
 856 #endif
 857                         {
 858                             float sum_val = 0.f;
 859                             for (int d = dstart; d < dend; ++d) {
 860                                 for (int y = ystart; y < yend; ++y) {
 861                                     for (int x = xstart; x < xend; ++x) {
 862                                         const int index = d * inp_width * inp_height + y * inp_width + x;
 863                                         float val = srcData[index];
 864                                         sum_val += val;
 865                                     }
 866                                 }
 867                             }
 868                             dstData[x0] = sum_val*inv_kernel_area;
 869                         }
 870                     }
 871                 }
 872                 else if (poolingType == ROI)
 873                 {
 874                     const float *roisData = rois->ptr<float>(n);
 875                     int xstartROI = roundRoiSize(roisData[1] * spatialScale);
 876                     int xendROI = roundRoiSize(roisData[3] * spatialScale);
 877                     int roiWidth = std::max(xendROI - xstartROI + 1, 1);
 878                     float roiRatio = (float)roiWidth / width;
 879                     for( ; x0 < x1; x0++ )
 880                     {
 881                         int xstart = xstartROI + x0 * roiRatio;
 882                         int xend = xstartROI + std::ceil((x0 + 1) * roiRatio);
 883                         xstart = max(xstart, 0);
 884                         xend = min(xend, inp_width);
 885                         if (xstart >= xend || ystart >= yend)
 886                         {
 887                             dstData[x0] = 0;
 888                             if (compMaxIdx && dstMaskData)
 889                                 dstMaskData[x0] = -1;
 890                             continue;
 891                         }
 892                         float max_val = -FLT_MAX;
 893                         for (int y = ystart; y < yend; ++y)
 894                             for (int x = xstart; x < xend; ++x)
 895                             {
 896                                 const int index = y * inp_width + x;
 897                                 float val = srcData[index];
 898                                 max_val = std::max(max_val, val);
 899                             }
 900                         dstData[x0] = max_val;
 901                     }
 902                 }
 903                 else  // PSROI
 904                 {
 905                     const float *roisData = rois->ptr<float>(n);
 906                     CV_Assert(roisData[0] < src->size[0]);
 907                     float xstartROI = roundRoiSize(roisData[1]) * spatialScale;
 908                     float xendROI = roundRoiSize(roisData[3] + 1) * spatialScale;
 909                     float roiWidth = std::max(xendROI - xstartROI, 0.1f);
 910                     float roiRatio = roiWidth / width;
 911                     for( ; x0 < x1; x0++ )
 912                     {
 913                         int xstart = (int)std::floor(xstartROI + x0 * roiRatio);
 914                         int xend = (int)std::ceil(xstartROI + (x0 + 1) * roiRatio);
 915                         xstart = max(xstart, 0);
 916                         xend = min(xend, inp_width);
 917                         if (xstart >= xend || ystart >= yend)
 918                         {
 919                             dstData[x0] = 0;
 920                             continue;
 921                         }
 922
 923                         srcData = src->ptr<float>(roisData[0], (c * height + y0) * width + x0);
 924                         float sum_val = 0.f;
 925                         for (int y = ystart; y < yend; ++y)
 926                             for (int x = xstart; x < xend; ++x)
 927                             {
 928                                 const int index = y * inp_width + x;
 929                                 float val = srcData[index];
 930                                 sum_val += val;
 931                             }
 932                         dstData[x0] = sum_val / ((yend - ystart) * (xend - xstart));
 933                     }
 934                 }
 935             }
 936         }
 937     };
 938
 939     void maxPooling(Mat &src, Mat &dst, Mat &mask)
 940     {
 941         const int nstripes = getNumThreads();
 942         Mat rois;
 943         PoolingInvoker::run(src, rois, dst, mask, kernel_size, strides, pads_begin, pads_end, avePoolPaddedArea, type, spatialScale, computeMaxIdx, nstripes);
 944     }
 945
 946     void avePooling(Mat &src, Mat &dst)
 947     {
 948         const int nstripes = getNumThreads();
 949         Mat rois, mask;
 950         PoolingInvoker::run(src, rois, dst, mask, kernel_size, strides, pads_begin, pads_end, avePoolPaddedArea, type, spatialScale, computeMaxIdx, nstripes);
 951     }
 952
 953     void roiPooling(const Mat &src, const Mat &rois, Mat &dst)
 954     {
 955         const int nstripes = getNumThreads();
 956         Mat mask;
 957         kernel_size.resize(2);
 958         strides.resize(2);
 959         pads_begin.resize(2);
 960         pads_end.resize(2);
 961         PoolingInvoker::run(src, rois, dst, mask, kernel_size, strides, pads_begin, pads_end, avePoolPaddedArea, type, spatialScale, computeMaxIdx, nstripes);
 962     }
 963
 964     virtual Ptr<BackendNode> initMaxPoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
 965     {
 966 #ifdef HAVE_HALIDE
 967         Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
 968         const int inWidth = inputBuffer.width();
 969         const int inHeight = inputBuffer.height();
 970
 971         Halide::Var x("x"), y("y"), c("c"), n("n");
 972         Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
 973         Halide::RDom r(0, kernel.width, 0, kernel.height);
 974         Halide::Expr kx, ky;
 975         if(pad_l || pad_t)
 976         {
 977             kx = clamp(x * stride.width + r.x - pad_l, 0, inWidth - 1);
 978             ky = clamp(y * stride.height + r.y - pad_t, 0, inHeight - 1);
 979         }
 980         else
 981         {
 982             kx = min(x * stride.width + r.x, inWidth - 1);
 983             ky = min(y * stride.height + r.y, inHeight - 1);
 984         }
 985
 986         // Halide::argmax returns tuple (r.x, r.y, max).
 987         Halide::Tuple res = argmax(inputBuffer(kx, ky, c, n));
 988
 989         // Compute offset from argmax in range [0, kernel_size).
 990         Halide::Expr max_index;
 991         if(pad_l || pad_t)
 992         {
 993             max_index = clamp(y * stride.height + res[1] - pad_t,
 994                               0, inHeight - 1) * inWidth +
 995                         clamp(x * stride.width + res[0] - pad_l,
 996                               0, inWidth - 1);
 997         }
 998         else
 999         {
1000             max_index = min(y * stride.height + res[1], inHeight - 1) * inWidth +
1001                         min(x * stride.width + res[0], inWidth - 1);
1002         }
1003         top(x, y, c, n) = { res[2], Halide::cast<float>(max_index) };
1004         return Ptr<BackendNode>(new HalideBackendNode(top));
1005 #endif  // HAVE_HALIDE
1006         return Ptr<BackendNode>();
1007     }
1008
1009     virtual Ptr<BackendNode> initAvePoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
1010     {
1011 #ifdef HAVE_HALIDE
1012         Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
1013
1014         const int inW = inputBuffer.width(), inH = inputBuffer.height();
1015         if ((inW - kernel.width) % stride.width || (inH - kernel.height) % stride.height)
1016         {
1017             CV_Error(cv::Error::StsNotImplemented,
1018                      "Halide backend for average pooling with partial "
1019                      "kernels is not implemented");
1020         }
1021
1022         const float norm = 1.0f / (kernel.width * kernel.height);
1023
1024         Halide::Var x("x"), y("y"), c("c"), n("n");
1025         Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
1026         Halide::RDom r(0, kernel.width, 0, kernel.height);
1027         top(x, y, c, n) = sum(
1028             inputBuffer(x * stride.width + r.x,
1029                         y * stride.height + r.y, c, n)) * norm;
1030         return Ptr<BackendNode>(new HalideBackendNode(top));
1031 #endif  // HAVE_HALIDE
1032         return Ptr<BackendNode>();
1033     }
1034
1035     virtual void applyHalideScheduler(Ptr<BackendNode>& node,
1036                                       const std::vector<Mat*> &inputs,
1037                                       const std::vector<Mat> &outputs,
1038                                       int targetId) const CV_OVERRIDE
1039     {
1040 #ifdef  HAVE_HALIDE
1041         if (targetId != DNN_TARGET_CPU)
1042         {
1043             Layer::applyHalideScheduler(node, inputs, outputs, targetId);
1044             return;
1045         }
1046         Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"),
1047                     xi("xi"), yi("yi"), ci("ci"), xo("xo"), yo("yo"), co("co");
1048         Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
1049
1050         int outW, outH, outC, outN;
1051         getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
1052
1053         if (outW < 8 || outH < 8)
1054         {
1055             if (outC > 8)
1056                 top.split(c, co, ci, 8)
1057                    .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
1058                    .parallel(tile)
1059                    .vectorize(ci);
1060             else
1061             {
1062                 top.fuse(y, c, tile).fuse(n, tile, tile)
1063                    .parallel(tile);
1064                 if (outW > 1)
1065                     top.vectorize(x);
1066             }
1067         }
1068         else
1069         {
1070             if (outC > 8)
1071                 top.split(x, xo, xi, 8).split(y, yo, yi, 8).split(c, co, ci, 8)
1072                    .fuse(xo, yo, tile).fuse(co, tile, tile).fuse(n, tile, tile)
1073                    .parallel(tile)
1074                    .vectorize(xi);
1075             else
1076                 top.split(x, xo, xi, 8).split(y, yo, yi, 8)
1077                    .fuse(xo, yo, tile).fuse(c, tile, tile).fuse(n, tile, tile)
1078                    .parallel(tile)
1079                    .vectorize(xi);
1080         }
1081 #endif  // HAVE_HALIDE
1082     }
1083
1084     bool getMemoryShapes(const std::vector<MatShape> &inputs,
1085                          const int requiredOutputs,
1086                          std::vector<MatShape> &outputs,
1087                          std::vector<MatShape> &internals) const CV_OVERRIDE
1088     {
1089         CV_Assert(inputs.size() != 0);
1090
1091         std::vector<int> inpShape(inputs[0].begin() + 2, inputs[0].end());
1092         std::vector<int> outShape(inputs[0].begin(), inputs[0].begin() + 2);
1093
1094         if (globalPooling)
1095         {
1096             outShape.push_back(1);
1097             outShape.push_back(1);
1098         }
1099         else if (type == ROI || type == PSROI)
1100         {
1101             outShape.push_back(pooledSize.height);
1102             outShape.push_back(pooledSize.width);
1103         }
1104         else if (padMode.empty())
1105         {
1106             for (int i = 0; i < kernel_size.size(); i++) {
1107                 float dst = (float)(inpShape[i] + pads_begin[i] + pads_end[i] - kernel_size[i]) / strides[i];
1108                 outShape.push_back(1 + (ceilMode ? ceil(dst) : floor(dst)));
1109             }
1110
1111             // If we have padding, ensure that the last pooling starts strictly
1112             // inside the image (instead of at the padding); otherwise clip the last.
1113             for (int i = 0; i < pads_end.size(); i++) {
1114                 if (pads_end[i] && (outShape[2 + i] - 1) * strides[i] >= inpShape[i] + pads_end[i]) {
1115                     --outShape[2 + i];
1116                     CV_Assert((outShape[2 + i] - 1) * strides[i] < inpShape[i] + pads_end[i]);
1117                 }
1118             }
1119         }
1120         else
1121         {
1122             getConvPoolOutParams(inpShape, kernel_size, strides, padMode, std::vector<size_t>(kernel_size.size(), 1), outShape);
1123         }
1124         if (type == ROI)
1125         {
1126             CV_Assert(inputs.size() == 2);
1127             outShape[0] = inputs[1][0];  // Number of proposals;
1128         }
1129         else if (type == PSROI)
1130         {
1131             CV_Assert(inputs.size() == 2);
1132             CV_Assert(psRoiOutChannels * pooledSize.width * pooledSize.height == inputs[0][1]);
1133             outShape[0] = inputs[1][0];  // Number of proposals;
1134             outShape[1] = psRoiOutChannels;
1135         }
1136         int numOutputs = requiredOutputs ? requiredOutputs : (type == MAX ? 2 : 1);
1137         CV_Assert(numOutputs == 1 || (numOutputs == 2 && type == MAX));
1138
1139         outputs.assign(numOutputs, outShape);
1140
1141         return false;
1142     }
1143
1144     virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
1145                            const std::vector<MatShape> &outputs) const CV_OVERRIDE
1146     {
1147         CV_UNUSED(inputs); // suppress unused variable warning
1148         long flops = 0;
1149         size_t karea = std::accumulate(kernel_size.begin(), kernel_size.end(),
1150                                     1, std::multiplies<size_t>());
1151         for(int i = 0; i < outputs.size(); i++)
1152         {
1153             if (type == MAX)
1154             {
1155                 if (i%2 == 0)
1156                     flops += total(outputs[i])*karea;
1157             }
1158             else
1159             {
1160                 flops += total(outputs[i])*(karea + 1);
1161             }
1162         }
1163         return flops;
1164     }
1165 private:
1166     enum Type
1167     {
1168         MAX,
1169         AVE,
1170         STOCHASTIC,
1171         ROI,   // RoI pooling, https://arxiv.org/pdf/1504.08083.pdf
1172         PSROI  // Position-sensitive RoI pooling, https://arxiv.org/pdf/1605.06409.pdf
1173     };
1174 };
1175
1176 Ptr<PoolingLayer> PoolingLayer::create(const LayerParams& params)
1177 {
1178     return Ptr<PoolingLayer>(new PoolingLayerImpl(params));
1179 }
1180
1181 }
1182 }