/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "../precomp.hpp"
#include "layers_common.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "op_halide.hpp"
#include "opencl_kernels_dnn.hpp"
#include <float.h>
#include <algorithm>
using std::max;
using std::min;
#ifdef HAVE_OPENCL
using namespace cv::dnn::ocl4dnn;
#endif

namespace cv
{
namespace dnn
{
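// Scales an ROI coordinate by `scale` (image space -> feature-map space) and
// rounds it to the nearest integer, half away from zero, as Caffe's ROIPooling
// does with round().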
static inline int scaleAndRoundRoi(float f, float scale)
{
    return (int)(f * scale + (f >= 0.f ? 0.5f : -0.5f));
}

class PoolingLayerImpl : public PoolingLayer
{
public:
    PoolingLayerImpl(const LayerParams& params)
    {
        type = MAX;
        computeMaxIdx = true;
        globalPooling = false;

        if (params.has("pool"))
        {
            String pool = params.get<String>("pool").toLowerCase();
            if (pool == "max")
                type = MAX;
            else if (pool == "ave")
                type = AVE;
            else if (pool == "stochastic")
                type = STOCHASTIC;
            else
                CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\"");
            getPoolingKernelParams(params, kernel.height, kernel.width, globalPooling,
                                   pad.height, pad.width, stride.height, stride.width, padMode);
        }
        else if (params.has("pooled_w") || params.has("pooled_h") || params.has("spatial_scale"))
        {
            type = ROI;
            computeMaxIdx = false;
        }
        setParamsFrom(params);
        ceilMode = params.get<bool>("ceil_mode", true);
        pooledSize.width = params.get<uint32_t>("pooled_w", 1);
        pooledSize.height = params.get<uint32_t>("pooled_h", 1);
        spatialScale = params.get<float>("spatial_scale", 1);
    }

#ifdef HAVE_OPENCL
    Ptr<OCL4DNNPool<float> > poolOp;
#endif

    void finalize(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
    {
        CV_Assert(!inputs.empty());

        cv::Size inp(inputs[0]->size[3], inputs[0]->size[2]),
                out(outputs[0].size[3], outputs[0].size[2]);

        if(globalPooling)
        {
            kernel = inp;
        }

        getConvPoolPaddings(inp, out, kernel, stride, padMode, Size(1, 1), pad);
    }

    virtual bool supportBackend(int backendId)
    {
        return backendId == DNN_BACKEND_DEFAULT ||
               (backendId == DNN_BACKEND_HALIDE && haveHalide() &&
                (type == MAX || (type == AVE && !pad.width && !pad.height)));
    }

#ifdef HAVE_OPENCL
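    // OpenCL path: the OCL4DNNPool primitive is created lazily from the layer
    // configuration on the first call and reused afterwards. For MAX pooling
    // every input has two associated outputs (pooled values and argmax mask).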
    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, InputArrayOfArrays internals)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);

        if (poolOp.empty())
        {
            OCL4DNNPoolConfig config;

            config.in_shape = shape(inputs[0]);
            config.out_shape = shape(outputs[0]);
            config.kernel = kernel;
            config.pad = pad;
            config.stride = stride;
            config.channels = inputs[0].size[1];
            config.pool_method = type == MAX ? LIBDNN_POOLING_METHOD_MAX :
                                (type == AVE ? LIBDNN_POOLING_METHOD_AVE :
                                               LIBDNN_POOLING_METHOD_STO);
            poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config));
        }

        for (size_t ii = 0; ii < inputs.size(); ii++)
        {
            UMat& inpMat = inputs[ii];
            int out_index = (type == MAX) ? 2 : 1;
            UMat& outMat = outputs[out_index * ii];
            UMat maskMat = (type == MAX) ? outputs[2 * ii + 1] : UMat();

            CV_Assert(inpMat.offset == 0 && outMat.offset == 0);

            if (!poolOp->Forward(inpMat, outMat, maskMat))
                return false;
        }

        return true;
    }
#endif

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
                   OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
    }

    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        switch (type)
        {
            case MAX:
                CV_Assert(inputs.size() == 1, outputs.size() == 2);
                maxPooling(*inputs[0], outputs[0], outputs[1]);
                break;
            case AVE:
                CV_Assert(inputs.size() == 1, outputs.size() == 1);
                avePooling(*inputs[0], outputs[0]);
                break;
            case ROI:
                CV_Assert(inputs.size() == 2, outputs.size() == 1);
                roiPooling(*inputs[0], *inputs[1], outputs[0]);
                break;
            default:
                CV_Error(Error::StsNotImplemented, "Not implemented");
                break;
        }
    }

    virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
    {
        if (type == MAX)
            return initMaxPoolingHalide(inputs);
        else if (type == AVE)
            return initAvePoolingHalide(inputs);
        else
            return Ptr<BackendNode>();
    }

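    // CPU implementation of MAX/AVE/ROI pooling. The flattened output tensor is
    // split into `nstripes` contiguous ranges that are processed concurrently via
    // parallel_for_; the inner loops are vectorized with universal intrinsics
    // where CV_SIMD128 is available.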
    class PoolingInvoker : public ParallelLoopBody
    {
    public:
        const Mat* src, *rois;
        Mat *dst, *mask;
        Size kernel, stride, pad;
        int nstripes;
        bool computeMaxIdx;
        std::vector<int> ofsbuf;
        int poolingType;
        float spatialScale;

        PoolingInvoker() : src(0), rois(0), dst(0), mask(0), nstripes(0),
                           computeMaxIdx(0), poolingType(MAX), spatialScale(0) {}

        static void run(const Mat& src, const Mat& rois, Mat& dst, Mat& mask, Size kernel,
                        Size stride, Size pad, int poolingType, float spatialScale,
                        bool computeMaxIdx, int nstripes)
        {
            CV_Assert(src.isContinuous() && dst.isContinuous() &&
                      src.type() == CV_32F && src.type() == dst.type() &&
                      src.dims == 4 && dst.dims == 4 &&
                      ((poolingType == ROI && dst.size[0] == rois.size[0]) ||
                       src.size[0] == dst.size[0]) && src.size[1] == dst.size[1] &&
                      (mask.empty() || (mask.type() == src.type() && mask.size == dst.size)));

            PoolingInvoker p;

            p.src = &src;
            p.rois = &rois;
            p.dst = &dst;
            p.mask = &mask;
            p.kernel = kernel;
            p.stride = stride;
            p.pad = pad;
            p.nstripes = nstripes;
            p.computeMaxIdx = computeMaxIdx;
            p.poolingType = poolingType;
            p.spatialScale = spatialScale;

            if( !computeMaxIdx )
            {
                p.ofsbuf.resize(kernel.width*kernel.height);
                for( int i = 0; i < kernel.height; i++ )
                    for( int j = 0; j < kernel.width; j++ )
                        p.ofsbuf[i*kernel.width + j] = src.size[3]*i + j;
            }

            parallel_for_(Range(0, nstripes), p, nstripes);
        }

        void operator()(const Range& r) const
        {
            int channels = dst->size[1], width = dst->size[3], height = dst->size[2];
            int inp_width = src->size[3], inp_height = src->size[2];
            size_t total = dst->total();
            size_t stripeSize = (total + nstripes - 1)/nstripes;
            size_t stripeStart = r.start*stripeSize;
            size_t stripeEnd = std::min(r.end*stripeSize, total);
            int kernel_w = kernel.width, kernel_h = kernel.height;
            int pad_w = pad.width, pad_h = pad.height;
            int stride_w = stride.width, stride_h = stride.height;
            bool compMaxIdx = computeMaxIdx;

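            // Per-lane index vectors used by the vectorized MAX branch: idx00 holds
            // the starting column offsets of four neighbouring pooling windows,
            // `ones` advances them by one input column, and idx_delta skips to the
            // next input row. ofsptr caches the precomputed kernel offsets used by
            // the no-mask fast path.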
#if CV_SIMD128
            const int* ofsptr = &ofsbuf[0];
            v_float32x4 idx00(0.f, (float)stride_w, (float)(stride_w*2), (float)(stride_w*3));
            v_float32x4 ones = v_setall_f32(1.f);
            v_float32x4 idx_delta = v_setall_f32((float)(inp_width - kernel_w));
#endif

            for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
            {
                size_t ofs = ofs0;
                int x0 = (int)(ofs % width);
                ofs /= width;
                int y0 = (int)(ofs % height);
                ofs /= height;
                int c = (int)(ofs % channels);
                int n = (int)(ofs / channels);
                int ystart, yend;

                const float *srcData;
                if (poolingType == ROI)
                {
                    const float *roisData = rois->ptr<float>(n);
                    int ystartROI = scaleAndRoundRoi(roisData[2], spatialScale);
                    int yendROI = scaleAndRoundRoi(roisData[4], spatialScale);
                    int roiHeight = std::max(yendROI - ystartROI + 1, 1);
                    float roiRatio = (float)roiHeight / height;

                    ystart = ystartROI + y0 * roiRatio;
                    yend = ystartROI + std::ceil((y0 + 1) * roiRatio);

                    CV_Assert(roisData[0] < src->size[0]);
                    srcData = src->ptr<float>(roisData[0], c);
                }
                else
                {
                    ystart = y0 * stride_h - pad_h;
                    yend = min(ystart + kernel_h, inp_height + pad_h);
                    srcData = src->ptr<float>(n, c);
                }
                int ydelta = yend - ystart;
                ystart = max(ystart, 0);
                yend = min(yend, inp_height);
                float *dstData = dst->ptr<float>(n, c, y0);
                float *dstMaskData = mask->data ? mask->ptr<float>(n, c, y0) : 0;

                int delta = std::min((int)(stripeEnd - ofs0), width - x0);
                ofs0 += delta;
                int x1 = x0 + delta;

                if( poolingType == MAX)
                    for( ; x0 < x1; x0++ )
                    {
                        int xstart = x0 * stride_w - pad_w;
                        int xend = min(xstart + kernel_w, inp_width);
                        xstart = max(xstart, 0);
                        if (xstart >= xend || ystart >= yend)
                        {
                            dstData[x0] = 0;
                            if (compMaxIdx && dstMaskData)
                                dstMaskData[x0] = -1;
                            continue;
                        }
#if CV_SIMD128
                        if( xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
                        {
                            if( compMaxIdx )
                            {
                                v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
                                v_float32x4 max_val1 = max_val0;
                                v_float32x4 max_idx0 = v_setall_f32(-1.f);
                                v_float32x4 max_idx1 = max_idx0;
                                int index0 = ystart * inp_width + xstart;
                                v_float32x4 idx0 = idx00 + v_setall_f32((float)index0);
                                v_float32x4 idx1 = idx0 + v_setall_f32((float)(stride_w*4));

                                for (int y = ystart; y < yend; ++y)
                                {
                                    for (int x = xstart; x < xend; ++x, idx0 += ones, idx1 += ones)
                                    {
                                        const int index = y * inp_width + x;
                                        v_float32x4 v0(srcData[index], srcData[index + stride_w],
                                                       srcData[index + stride_w*2], srcData[index + stride_w*3]);
                                        v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
                                                       srcData[index + stride_w*6], srcData[index + stride_w*7]);
                                        max_idx0 = v_select(v0 > max_val0, idx0, max_idx0);
                                        max_idx1 = v_select(v1 > max_val1, idx1, max_idx1);
                                        max_val0 = v_max(max_val0, v0);
                                        max_val1 = v_max(max_val1, v1);
                                    }
                                    idx0 += idx_delta;
                                    idx1 += idx_delta;
                                }
                                v_store(dstData + x0, max_val0);
                                v_store(dstData + x0 + 4, max_val1);
                                if (dstMaskData)
                                {
                                    v_store(dstMaskData + x0, max_idx0);
                                    v_store(dstMaskData + x0 + 4, max_idx1);
                                }
                                x0 += 7;
                            }
                            else
                            {
                                v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
                                v_float32x4 max_val1 = max_val0;

                                if( yend - ystart == kernel_h )
                                {
                                    const float* srcData1 = srcData + ystart*inp_width + xstart;
                                    if( stride_w == 1 )
                                        for (int k = 0; k < kernel_w*kernel_h; k++)
                                        {
                                            int index = ofsptr[k];
                                            v_float32x4 v0 = v_load(srcData1 + index);
                                            v_float32x4 v1 = v_load(srcData1 + index + 4);
                                            max_val0 = v_max(max_val0, v0);
                                            max_val1 = v_max(max_val1, v1);
                                        }
#if CV_SSE2
                                    else if( stride_w == 2 )
                                        for (int k = 0; k < kernel_w*kernel_h; k++)
                                        {
                                            int index = ofsptr[k];
                                            v_float32x4 v00 = v_load(srcData1 + index), v01 = v_load(srcData1 + index + 4);
                                            v_float32x4 v0(_mm_shuffle_ps(v00.val, v01.val, _MM_SHUFFLE(2, 0, 2, 0)));
                                            v_float32x4 v10 = v_load(srcData1 + index + 8), v11 = v_load(srcData1 + index + 12);
                                            v_float32x4 v1(_mm_shuffle_ps(v10.val, v11.val, _MM_SHUFFLE(2, 0, 2, 0)));
                                            max_val0 = v_max(max_val0, v0);
                                            max_val1 = v_max(max_val1, v1);
                                        }
#endif
                                    else
                                        for (int k = 0; k < kernel_w*kernel_h; k++)
                                        {
                                            int index = ofsptr[k];
                                            v_float32x4 v0(srcData1[index], srcData1[index + stride_w],
                                                           srcData1[index + stride_w*2], srcData1[index + stride_w*3]);
                                            v_float32x4 v1(srcData1[index + stride_w*4], srcData1[index + stride_w*5],
                                                           srcData1[index + stride_w*6], srcData1[index + stride_w*7]);
                                            max_val0 = v_max(max_val0, v0);
                                            max_val1 = v_max(max_val1, v1);
                                        }
                                }
                                else
                                {
                                    for (int y = ystart; y < yend; ++y)
                                    {
                                        for (int x = xstart; x < xend; ++x)
                                        {
                                            const int index = y * inp_width + x;
                                            v_float32x4 v0(srcData[index], srcData[index + stride_w],
                                                           srcData[index + stride_w*2], srcData[index + stride_w*3]);
                                            v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
                                                           srcData[index + stride_w*6], srcData[index + stride_w*7]);
                                            max_val0 = v_max(max_val0, v0);
                                            max_val1 = v_max(max_val1, v1);
                                        }
                                    }
                                }
                                v_store(dstData + x0, max_val0);
                                v_store(dstData + x0 + 4, max_val1);
                                x0 += 7;
                            }
                        }
                        else
#endif
                        {
                            float max_val = -FLT_MAX;
                            if( compMaxIdx )
                            {
                                int max_index = -1;
                                for (int y = ystart; y < yend; ++y)
                                    for (int x = xstart; x < xend; ++x)
                                    {
                                        const int index = y * inp_width + x;
                                        float val = srcData[index];
                                        if (val > max_val)
                                        {
                                            max_val = val;
                                            max_index = index;
                                        }
                                    }

                                dstData[x0] = max_val;
                                if (dstMaskData)
                                    dstMaskData[x0] = max_index;
                            }
                            else
                            {
                                for (int y = ystart; y < yend; ++y)
                                    for (int x = xstart; x < xend; ++x)
                                    {
                                        const int index = y * inp_width + x;
                                        float val = srcData[index];
                                        max_val = std::max(max_val, val);
                                    }

                                dstData[x0] = max_val;
                            }
                        }
                    }
                else if (poolingType == AVE)
                {
                    for( ; x0 < x1; x0++ )
                    {
                        int xstart = x0 * stride_w - pad_w;
                        int xend = min(xstart + kernel_w, inp_width + pad_w);
                        int xdelta = xend - xstart;
                        xstart = max(xstart, 0);
                        xend = min(xend, inp_width);
                        float inv_kernel_area = 1.f/(ydelta*xdelta);

#if CV_SIMD128
                        if( xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
                        {
                            v_float32x4 sum_val0 = v_setzero_f32(), sum_val1 = v_setzero_f32();
                            v_float32x4 ikarea = v_setall_f32(inv_kernel_area);

                            for (int y = ystart; y < yend; ++y)
                            {
                                for (int x = xstart; x < xend; ++x)
                                {
                                    const int index = y * inp_width + x;
                                    v_float32x4 v0(srcData[index], srcData[index + stride_w],
                                                   srcData[index + stride_w*2], srcData[index + stride_w*3]);
                                    v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
                                                   srcData[index + stride_w*6], srcData[index + stride_w*7]);
                                    sum_val0 += v0;
                                    sum_val1 += v1;
                                }
                            }
                            v_store(dstData + x0, sum_val0*ikarea);
                            v_store(dstData + x0 + 4, sum_val1*ikarea);
                            x0 += 7;
                        }
                        else
#endif
                        {
                            float sum_val = 0.f;
                            for (int y = ystart; y < yend; ++y)
                                for (int x = xstart; x < xend; ++x)
                                {
                                    const int index = y * inp_width + x;
                                    float val = srcData[index];
                                    sum_val += val;
                                }

                            dstData[x0] = sum_val*inv_kernel_area;
                        }
                    }
                }
                else  // ROI
                {
                    const float *roisData = rois->ptr<float>(n);
                    int xstartROI = scaleAndRoundRoi(roisData[1], spatialScale);
                    int xendROI = scaleAndRoundRoi(roisData[3], spatialScale);
                    int roiWidth = std::max(xendROI - xstartROI + 1, 1);
                    float roiRatio = (float)roiWidth / width;
                    for( ; x0 < x1; x0++ )
                    {
                        int xstart = xstartROI + x0 * roiRatio;
                        int xend = xstartROI + std::ceil((x0 + 1) * roiRatio);
                        xstart = max(xstart, 0);
                        xend = min(xend, inp_width);
                        if (xstart >= xend || ystart >= yend)
                        {
                            dstData[x0] = 0;
                            if (compMaxIdx && dstMaskData)
                                dstMaskData[x0] = -1;
                            continue;
                        }
                        float max_val = -FLT_MAX;
                        for (int y = ystart; y < yend; ++y)
                            for (int x = xstart; x < xend; ++x)
                            {
                                const int index = y * inp_width + x;
                                float val = srcData[index];
                                max_val = std::max(max_val, val);
                            }
                        dstData[x0] = max_val;
                    }
                }
            }
        }
    };

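    // Thin wrappers over PoolingInvoker::run, using one stripe per available
    // thread; arguments a particular mode does not use (rois for plain pooling,
    // mask for AVE/ROI) are passed as empty Mats.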
    void maxPooling(Mat &src, Mat &dst, Mat &mask)
    {
        const int nstripes = getNumThreads();
        Mat rois;
        PoolingInvoker::run(src, rois, dst, mask, kernel, stride, pad, type, spatialScale, computeMaxIdx, nstripes);
    }

    void avePooling(Mat &src, Mat &dst)
    {
        const int nstripes = getNumThreads();
        Mat rois, mask;
        PoolingInvoker::run(src, rois, dst, mask, kernel, stride, pad, type, spatialScale, computeMaxIdx, nstripes);
    }

    void roiPooling(const Mat &src, const Mat &rois, Mat &dst)
    {
        const int nstripes = getNumThreads();
        Mat mask;
        PoolingInvoker::run(src, rois, dst, mask, kernel, stride, pad, type, spatialScale, computeMaxIdx, nstripes);
    }

    virtual Ptr<BackendNode> initMaxPoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
    {
#ifdef HAVE_HALIDE
        Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
        const int inWidth = inputBuffer.width();
        const int inHeight = inputBuffer.height();

        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        Halide::RDom r(0, kernel.width, 0, kernel.height);
        Halide::Expr kx, ky;
        if (pad.width || pad.height)
        {
            kx = clamp(x * stride.width + r.x - pad.width, 0, inWidth - 1);
            ky = clamp(y * stride.height + r.y - pad.height, 0, inHeight - 1);
        }
        else
        {
            kx = min(x * stride.width + r.x, inWidth - 1);
            ky = min(y * stride.height + r.y, inHeight - 1);
        }

        // Halide::argmax returns tuple (r.x, r.y, max).
        Halide::Tuple res = argmax(inputBuffer(kx, ky, c, n));

        // Convert the argmax coordinates into a flat index within the input plane.
        Halide::Expr max_index;
        if (pad.width || pad.height)
        {
            max_index = clamp(y * stride.height + res[1] - pad.height,
                              0, inHeight - 1) * inWidth +
                        clamp(x * stride.width + res[0] - pad.width,
                              0, inWidth - 1);
        }
        else
        {
            max_index = min(y * stride.height + res[1], inHeight - 1) * inWidth +
                        min(x * stride.width + res[0], inWidth - 1);
        }
        top(x, y, c, n) = { res[2], Halide::cast<float>(max_index) };
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }

    virtual Ptr<BackendNode> initAvePoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
    {
#ifdef HAVE_HALIDE
        Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);

        const int inW = inputBuffer.width(), inH = inputBuffer.height();
        if ((inW - kernel.width) % stride.width || (inH - kernel.height) % stride.height)
        {
            CV_Error(cv::Error::StsNotImplemented,
                     "Halide backend for average pooling with partial "
                     "kernels is not implemented");
        }

        const float norm = 1.0f / (kernel.width * kernel.height);

        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        Halide::RDom r(0, kernel.width, 0, kernel.height);
        top(x, y, c, n) = sum(
            inputBuffer(x * stride.width + r.x,
                        y * stride.height + r.y, c, n)) * norm;
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }

    virtual void applyHalideScheduler(Ptr<BackendNode>& node,
                                      const std::vector<Mat*> &inputs,
                                      const std::vector<Mat> &outputs,
                                      int targetId) const
    {
#ifdef HAVE_HALIDE
        if (targetId != DNN_TARGET_CPU)
        {
            Layer::applyHalideScheduler(node, inputs, outputs, targetId);
            return;
        }
        Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"),
                    xi("xi"), yi("yi"), ci("ci"), xo("xo"), yo("yo"), co("co");
        Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();

        int outW, outH, outC, outN;
        getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);

        if (outW < 8 || outH < 8)
        {
            if (outC > 8)
                top.split(c, co, ci, 8)
                   .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
                   .parallel(tile)
                   .vectorize(ci);
            else
            {
                top.fuse(y, c, tile).fuse(n, tile, tile)
                   .parallel(tile);
                if (outW > 1)
                    top.vectorize(x);
            }
        }
        else
        {
            if (outC > 8)
                top.split(x, xo, xi, 8).split(y, yo, yi, 8).split(c, co, ci, 8)
                   .fuse(xo, yo, tile).fuse(co, tile, tile).fuse(n, tile, tile)
                   .parallel(tile)
                   .vectorize(xi);
            else
                top.split(x, xo, xi, 8).split(y, yo, yi, 8)
                   .fuse(xo, yo, tile).fuse(c, tile, tile).fuse(n, tile, tile)
                   .parallel(tile)
                   .vectorize(xi);
        }
#endif  // HAVE_HALIDE
    }

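    // Output shape: 1x1 for global pooling, the configured pooled size for ROI
    // pooling, the Caffe-style ceil/floor formula when no padMode is given, and
    // getConvPoolOutParams() for SAME/VALID padding. MAX pooling declares a
    // second output that holds the argmax mask.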
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const
    {
        CV_Assert(inputs.size() != 0);
        Size in(inputs[0][3], inputs[0][2]), out;

        if (globalPooling)
        {
            out.height = 1;
            out.width = 1;
        }
        else if (type == ROI)
        {
            out.height = pooledSize.height;
            out.width = pooledSize.width;
        }
        else if (padMode.empty())
        {
            float height = (float)(in.height + 2 * pad.height - kernel.height) / stride.height;
            float width = (float)(in.width + 2 * pad.width - kernel.width) / stride.width;
            out.height = 1 + (ceilMode ? ceil(height) : floor(height));
            out.width = 1 + (ceilMode ? ceil(width) : floor(width));

            if (pad.height || pad.width)
            {
                // If we have padding, ensure that the last pooling starts strictly
                // inside the image (instead of at the padding); otherwise clip the last.
                if ((out.height - 1) * stride.height >= in.height + pad.height)
                    --out.height;
                if ((out.width - 1) * stride.width >= in.width + pad.width)
                    --out.width;
                CV_Assert((out.height - 1) * stride.height < in.height + pad.height);
                CV_Assert((out.width - 1) * stride.width < in.width + pad.width);
            }
        }
        else
        {
            getConvPoolOutParams(in, kernel, stride, padMode, Size(1, 1), out);
        }

        int dims[] = {inputs[0][0], inputs[0][1], out.height, out.width};
        if (type == ROI)
        {
            CV_Assert(inputs.size() == 2);
            dims[0] = inputs[1][0];  // Number of proposals.
        }
        outputs.assign(type == MAX ? 2 : 1, shape(dims));
        return false;
    }

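    // FLOPS estimate: one comparison per kernel element for MAX (the mask outputs
    // at odd indices are skipped), and kernel.area() additions plus one multiply
    // per output element for AVE/ROI.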
    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const
    {
        (void)inputs; // suppress unused variable warning
        long flops = 0;

        for(int i = 0; i < outputs.size(); i++)
        {
            if (type == MAX)
            {
                if (i%2 == 0)
                    flops += total(outputs[i])*kernel.area();
            }
            else
            {
                flops += total(outputs[i])*(kernel.area() + 1);
            }
        }
        return flops;
    }
private:
    enum Type
    {
        MAX,
        AVE,
        STOCHASTIC,
        ROI
    };
};

Ptr<PoolingLayer> PoolingLayer::create(const LayerParams& params)
{
    return Ptr<PoolingLayer>(new PoolingLayerImpl(params));
}

}
}