1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Copyright (C) 2017, Intel Corporation, all rights reserved.
15 // Third party copyrights are property of their respective owners.
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
43 #include "../precomp.hpp"
44 #include "layers_common.hpp"
45 #include "opencv2/core/hal/intrin.hpp"
46 #include "../op_cuda.hpp"
47 #include "../op_halide.hpp"
48 #include "../op_inf_engine.hpp"
49 #include "../op_vkcom.hpp"
57 #include "opencl_kernels_dnn.hpp"
58 using namespace cv::dnn::ocl4dnn;
62 #include "../cuda4dnn/primitives/pooling.hpp"
63 #include "../cuda4dnn/primitives/max_unpooling.hpp"
64 using namespace cv::dnn::cuda4dnn;
/**
 * Rounds a floating-point RoI coordinate to the nearest integer, with halfway
 * cases rounded away from zero.
 *
 * The rounding is delegated to std::lround instead of adding +/-0.5f and
 * truncating: the addition is itself rounded in float precision, so it gives a
 * wrong answer just below a tie (0.49999997f + 0.5f == 1.0f) and for large odd
 * magnitudes (8388609.0f + 0.5f == 8388610.0f).
 *
 * @param v coordinate to round; must fit in the range of int after rounding.
 * @return the integer nearest to v.
 */
static inline int roundRoiSize(float v)
{
    return static_cast<int>(std::lround(v));
}
// Pooling layer implementation.
// NOTE(review): this listing appears to have lines elided (braces and some statements are
// not visible), so the comments below describe only the code that is actually shown.
76 class PoolingLayerImpl CV_FINAL : public PoolingLayer
// Configures the layer from params; which keys are present decides the branch taken.
79 PoolingLayerImpl(const LayerParams& params)
82 globalPooling = false;
84 pad_t = pad_l = pad_b = pad_r = 0;
// Branch 1: the pool key or any kernel-size key is present.
86 if (params.has("pool") || params.has("kernel_size") ||
87 params.has("kernel_w") || params.has("kernel_h"))
// The pooling name is lower-cased before comparison and defaults to max.
89 String pool = toLowerCase(params.get<String>("pool", "max"));
92 else if (pool == "ave")
94 else if (pool == "stochastic")
// Any other pooling name is rejected with StsBadArg.
97 CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\"");
// Presumably fills the kernel/stride/padding vectors and padMode from params - confirm in the helper.
99 getPoolingKernelParams(params, kernel_size, globalPooling, pads_begin, pads_end, strides, padMode);
// For a 2-D kernel the vector values are mirrored into the Size and scalar members.
// Index 0 is the vertical entry (pad_t) and index 1 the horizontal one (pad_l).
100 if (kernel_size.size() == 2) {
101 kernel = Size(kernel_size[1], kernel_size[0]);
102 stride = Size(strides[1], strides[0]);
103 pad = Size(pads_begin[1], pads_begin[0]);
105 pad_t = pads_begin[0];
106 pad_l = pads_begin[1];
// Branch 2: an explicit pooled output size is given; each dimension defaults to 1.
111 else if (params.has("pooled_w") || params.has("pooled_h"))
114 pooledSize.width = params.get<uint32_t>("pooled_w", 1);
115 pooledSize.height = params.get<uint32_t>("pooled_h", 1);
// Branch 3: output_dim together with group_size; the pooled grid is square
// (group_size x group_size) and output_dim is stored in psRoiOutChannels.
117 else if (params.has("output_dim") && params.has("group_size"))
120 pooledSize.width = params.get<int>("group_size");
121 pooledSize.height = pooledSize.width;
122 psRoiOutChannels = params.get<int>("output_dim");
// None of the recognised key combinations was found.
125 CV_Error(Error::StsBadArg, "Cannot determine pooling type");
126 setParamsFrom(params);
// Optional parameters: ceil_mode defaults to true, spatial_scale to 1,
// ave_pool_padded_area to true.
127 ceilMode = params.get<bool>("ceil_mode", true);
128 spatialScale = params.get<float>("spatial_scale", 1);
129 avePoolPaddedArea = params.get<bool>("ave_pool_padded_area", true);
// OpenCL pooling primitive; it is assigned in forward_ocl.
133 Ptr<OCL4DNNPool<float> > poolOp;
// Recomputes shape-dependent state (kernel, paddings, computeMaxIdx) from the
// actual input and output blobs.
136 void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
138 std::vector<Mat> inputs, outputs;
139 inputs_arr.getMatVector(inputs);
140 outputs_arr.getMatVector(outputs);
142 CV_Assert(!inputs.empty());
// Collect the sizes of every axis from index 2 onwards for the first input and first output.
// NOTE(review): outputs[0] is indexed without a visible emptiness check - confirm.
144 std::vector<int> inp;
145 std::vector<int> out;
146 for (int i = 2; i < inputs[0].dims; i++) {
147 inp.push_back(inputs[0].size[i]);
148 out.push_back(outputs[0].size[i]);
// The kernel is set to the full input extent here; presumably this is guarded by a
// globalPooling check that is not visible in this listing - TODO confirm.
151 kernel = Size(inp[1], inp[0]);
152 kernel_size = std::vector<size_t>(inp.begin(), inp.end());
// Presumably resolves pads_begin/pads_end for the current padMode - confirm in the helper.
155 getConvPoolPaddings(inp, kernel_size, strides, padMode, pads_begin, pads_end);
156 if (pads_begin.size() == 2) {
157 pad_t = pads_begin[0];
158 pad_l = pads_begin[1];
// The index mask is produced only for max pooling when two outputs are present.
166 computeMaxIdx = type == MAX && outputs.size() == 2;
// Reports whether this layer configuration can run on the given backend.
169 virtual bool supportBackend(int backendId) CV_OVERRIDE
// CUDA: only max and average pooling.
171 if (backendId == DNN_BACKEND_CUDA)
173 return type == MAX || type == AVE;
175 else if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
179 #ifdef HAVE_INF_ENGINE
// Inference Engine: 3-D kernels are accepted on the CPU target only.
180 if (kernel_size.size() == 3)
181 return preferableTarget == DNN_TARGET_CPU;
// Myriad target: releases up to 2019R1 single out max pooling with pad 1 and stride 2x2
// (the body of that special case is not visible here); otherwise max and average only.
182 if (preferableTarget == DNN_TARGET_MYRIAD) {
183 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R1)
184 if (type == MAX && (pad_l == 1 && pad_t == 1) && stride == Size(2, 2) ) {
188 return type == MAX || type == AVE;
// Other Inference Engine targets: everything except stochastic pooling.
191 return type != STOCHASTIC;
// Remaining backends: 3-D kernels run only on the OpenCV backend with the CPU target.
198 if (kernel_size.size() == 3)
199 return (backendId == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_CPU);
// Empty or 2-D kernel: OpenCV always; Halide for max, or average without any padding;
// Vulkan for max and average.
200 if (kernel_size.empty() || kernel_size.size() == 2)
201 return backendId == DNN_BACKEND_OPENCV ||
202 (backendId == DNN_BACKEND_HALIDE && haveHalide() &&
203 (type == MAX || (type == AVE && !pad_t && !pad_l && !pad_b && !pad_r))) ||
204 (backendId == DNN_BACKEND_VKCOM && haveVulkan() &&
205 (type == MAX || type == AVE));
// OpenCL forward pass: builds the OCL4DNN pooling configuration from the layer state and
// returns the result of poolOp->Forward.
212 bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, InputArrayOfArrays internals)
214 std::vector<UMat> inputs;
215 std::vector<UMat> outputs;
// An input depth of CV_16S switches the primitive to its use_half mode.
217 bool use_half = (inps.depth() == CV_16S);
218 inps.getUMatVector(inputs);
219 outs.getUMatVector(outputs);
// NOTE(review): inputs[0] and outputs[0] are read below before the size asserts further
// down; the guard around this configuration code is not visible - confirm it is safe.
223 OCL4DNNPoolConfig config;
225 config.in_shape = shape(inputs[0]);
226 config.out_shape = shape(outputs[0]);
227 config.kernel = kernel;
228 config.pad_l = pad_l;
229 config.pad_t = pad_t;
230 config.pad_r = pad_r;
231 config.pad_b = pad_b;
232 config.stride = stride;
233 config.channels = inputs[0].size[1];
// Anything that is neither MAX nor AVE maps to the stochastic method.
234 config.pool_method = type == MAX ? LIBDNN_POOLING_METHOD_MAX :
235 (type == AVE ? LIBDNN_POOLING_METHOD_AVE :
236 LIBDNN_POOLING_METHOD_STO);
237 config.avePoolPaddedArea = avePoolPaddedArea;
238 config.computeMaxIdx = computeMaxIdx;
239 config.use_half = use_half;
// Presumably executed only when poolOp has not been created yet - TODO confirm.
240 poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config));
// Exactly one input; a second output is required when the index mask is requested.
243 CV_Assert_N(inputs.size() == 1, !outputs.empty(), !computeMaxIdx || outputs.size() == 2);
244 UMat& inpMat = inputs[0];
245 UMat& outMat = outputs[0];
246 UMat maskMat = computeMaxIdx ? outputs[1] : UMat();
// Both buffers must start at offset zero.
248 CV_Assert(inpMat.offset == 0 && outMat.offset == 0);
250 return poolOp->Forward(inpMat, outMat, maskMat);
// CPU entry point: tries OpenCL where applicable, falls back for CV_16S data, and
// otherwise dispatches to the pooling routine that matches the layer type.
254 void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
257 CV_TRACE_ARG_VALUE(name, "name", name.c_str());
// The OpenCL path is attempted only for max, average and stochastic pooling on an OpenCL target.
259 if (type == MAX || type == AVE || type == STOCHASTIC)
261 CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
262 forward_ocl(inputs_arr, outputs_arr, internals_arr))
// CV_16S inputs are handed to the generic fallback.
264 if (inputs_arr.depth() == CV_16S)
266 forward_fallback(inputs_arr, outputs_arr, internals_arr);
270 std::vector<Mat> inputs, outputs;
271 inputs_arr.getMatVector(inputs);
272 outputs_arr.getMatVector(outputs);
// Max pooling: one input; the second output, when requested, receives the index mask.
278 CV_Assert_N(inputs.size() == 1, !computeMaxIdx || outputs.size() == 2);
279 Mat mask = computeMaxIdx ? outputs[1] : Mat();
280 maxPooling(inputs[0], outputs[0], mask);
// Average pooling: exactly one input and one output.
284 CV_Assert_N(inputs.size() == 1, outputs.size() == 1);
285 avePooling(inputs[0], outputs[0]);
// RoI variants: the second input holds the regions of interest.
287 case ROI: case PSROI:
288 CV_Assert_N(inputs.size() == 2, outputs.size() == 1);
289 roiPooling(inputs[0], inputs[1], outputs[0]);
292 CV_Error(Error::StsNotImplemented, "Not implemented");
// Creates the CUDA backend node. Two configurations are visible: a dedicated max pooling
// op driven by the stream, and a general pooling op driven by the cuDNN handle.
298 Ptr<BackendNode> initCUDA(
300 const std::vector<Ptr<BackendWrapper>>& inputs,
301 const std::vector<Ptr<BackendWrapper>>& outputs
// context_ comes from a parameter whose declaration is not visible in this listing.
304 auto context = reinterpret_cast<csl::CSLContext*>(context_);
306 auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
307 auto input_shape = input_wrapper->getShape();
309 /* storing max indices is a special case and we deal with it separately */
311 CV_Assert(type == MAX);
313 cuda4dnn::MaxPoolingConfiguration config;
314 config.window_size.assign(std::begin(kernel_size), std::end(kernel_size));
315 config.strides.assign(std::begin(strides), std::end(strides));
// Padding mode: manual (explicit pads_begin), VALID or SAME; anything else is rejected.
// Only pads_begin is copied in this configuration.
319 config.padMode = MaxPoolingConfiguration::PaddingMode::MANUAL;
320 config.pads_begin.assign(std::begin(pads_begin), std::end(pads_begin));
322 else if (padMode == "VALID")
324 config.padMode = MaxPoolingConfiguration::PaddingMode::VALID;
326 else if (padMode == "SAME")
328 config.padMode = MaxPoolingConfiguration::PaddingMode::SAME;
332 CV_Error(Error::StsNotImplemented, padMode + " padding mode not supported by PoolingLayer");
335 config.input_shape.assign(std::begin(input_shape), std::end(input_shape));
337 return make_cuda_node<cuda4dnn::MaxPoolingOp>(preferableTarget, std::move(context->stream), config);
// General path: max, or average with padding excluded or included per avePoolPaddedArea.
340 PoolingConfiguration config;
343 config.poolMode = PoolingConfiguration::PoolingMode::MAX;
345 else if (type == AVE && !avePoolPaddedArea)
347 config.poolMode = PoolingConfiguration::PoolingMode::AVERAGE_EXCLUDE_PADDING;
349 else if (type == AVE && avePoolPaddedArea)
351 config.poolMode = PoolingConfiguration::PoolingMode::AVERAGE_INCLUDE_PADDING;
355 CV_Error(Error::StsNotImplemented, "Unsupported pooling mode");
358 config.window_size.assign(std::begin(kernel_size), std::end(kernel_size));
359 config.strides.assign(std::begin(strides), std::end(strides));
// Same padding-mode mapping as above, but both pads_begin and pads_end are copied.
363 config.padMode = PoolingConfiguration::PaddingMode::MANUAL;
364 config.pads_begin.assign(std::begin(pads_begin), std::end(pads_begin));
365 config.pads_end.assign(std::begin(pads_end), std::end(pads_end));
367 else if (padMode == "VALID")
369 config.padMode = PoolingConfiguration::PaddingMode::VALID;
371 else if (padMode == "SAME")
373 config.padMode = PoolingConfiguration::PaddingMode::SAME;
377 CV_Error(Error::StsNotImplemented, padMode + " padding mode not supported by PoolingLayer");
// Rounding mode; presumably selected by ceilMode (the condition is not visible) - confirm.
381 config.roundMode = PoolingConfiguration::RoundingMode::CEIL;
383 config.roundMode = PoolingConfiguration::RoundingMode::FLOOR;
385 config.input_shape.assign(std::begin(input_shape), std::end(input_shape));
387 return make_cuda_node<cuda4dnn::PoolingOp>(preferableTarget, std::move(context->cudnn_handle), config);
// Creates the Vulkan backend node from the 2-D kernel, begin padding and stride.
391 virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
395 vkcom::PoolType pool_type;
// The arrays are ordered height first, then width. Only the begin padding (pad) is passed.
396 int filter_size[2] = {kernel.height, kernel.width};
397 int pad_size[2] = {pad.height, pad.width};
398 int stride_size[2] = {stride.height, stride.width};
// Any type other than MAX or AVE maps to kPoolTypeNum.
399 pool_type = type == MAX ? vkcom::kPoolTypeMax:
400 (type == AVE ? vkcom::kPoolTypeAvg:
401 vkcom::kPoolTypeNum);
// Padding mode: Caffe-style explicit padding, VALID or SAME; anything else is an error.
// The condition selecting the Caffe mode is not visible in this listing.
405 padding_mode = vkcom::kPaddingModeCaffe;
407 else if (padMode == "VALID")
409 padding_mode = vkcom::kPaddingModeValid;
411 else if (padMode == "SAME")
413 padding_mode = vkcom::kPaddingModeSame;
416 CV_Error(Error::StsError, "Unsupported padding mode " + padMode);
418 std::shared_ptr<vkcom::OpBase> op(new vkcom::OpPool(filter_size, pad_size,
419 stride_size, padding_mode,
420 pool_type, avePoolPaddedArea));
421 return Ptr<BackendNode>(new VkComBackendNode(inputs, op));
// Empty node; presumably the path taken when Vulkan support is compiled out - confirm.
423 return Ptr<BackendNode>();
// Creates the Halide backend node by delegating to the max or average pooling builder;
// returns an empty node otherwise.
426 virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
// Presumably guarded by a type == MAX check that is not visible here.
429 return initMaxPoolingHalide(inputs);
430 else if (type == AVE)
431 return initAvePoolingHalide(inputs);
433 return Ptr<BackendNode>();
436 #ifdef HAVE_INF_ENGINE
// Creates the Inference Engine builder layer matching the pooling type.
437 virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
// Regular max or average pooling.
439 if (type == MAX || type == AVE)
441 InferenceEngine::Builder::PoolingLayer ieLayer(name);
443 ieLayer.setKernel(kernel_size);
444 ieLayer.setStrides(strides);
445 ieLayer.setPaddingsBegin(pads_begin);
446 ieLayer.setPaddingsEnd(pads_end);
448 ieLayer.setPoolingType(type == MAX ?
449 InferenceEngine::Builder::PoolingLayer::PoolingType::MAX :
450 InferenceEngine::Builder::PoolingLayer::PoolingType::AVG);
451 ieLayer.setRoundingType(ceilMode ?
452 InferenceEngine::Builder::PoolingLayer::RoundingType::CEIL :
453 InferenceEngine::Builder::PoolingLayer::RoundingType::FLOOR);
// Padding is excluded only for average pooling in SAME mode.
// NOTE(review): avePoolPaddedArea is not consulted here - confirm this is intended.
454 ieLayer.setExcludePad(type == AVE && padMode == "SAME");
// A non-empty padMode is forwarded as auto_pad: valid for VALID, same_upper for anything else.
456 InferenceEngine::Builder::Layer l = ieLayer;
457 if (!padMode.empty())
458 l.getParameters()["auto_pad"] = padMode == "VALID" ? std::string("valid") : std::string("same_upper");
459 return Ptr<BackendNode>(new InfEngineBackendNode(l));
// RoI pooling: spatial scale, pooled size as {height, width}, and two input ports.
461 else if (type == ROI)
463 InferenceEngine::Builder::ROIPoolingLayer ieLayer(name);
464 ieLayer.setSpatialScale(spatialScale);
465 ieLayer.setPooled({pooledSize.height, pooledSize.width});
466 ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(2));
467 return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
// Position-sensitive RoI pooling: the group size is taken from pooledSize.width.
469 else if (type == PSROI)
471 InferenceEngine::Builder::PSROIPoolingLayer ieLayer(name);
472 ieLayer.setSpatialScale(spatialScale);
473 ieLayer.setOutputDim(psRoiOutChannels);
474 ieLayer.setGroupSize(pooledSize.width);
475 ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(2));
476 return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
479 CV_Error(Error::StsNotImplemented, "Unsupported pooling type");
480 return Ptr<BackendNode>();
482 #endif // HAVE_INF_ENGINE
// Parallel loop body that performs the CPU pooling. Instances are set up by the static
// run() helper. Some member declarations are not visible in this listing.
485 class PoolingInvoker : public ParallelLoopBody
// Source blob and optional regions of interest.
488 const Mat* src, *rois;
// Scalar paddings of the last two axes.
491 int pad_l, pad_t, pad_r, pad_b;
// Selects which divisor the average pooling path uses (see the AVE branch of operator()).
492 bool avePoolPaddedArea;
// Offsets of every kernel tap relative to the window origin; filled in run().
495 std::vector<int> ofsbuf;
// Per-axis parameters copied from the layer.
499 std::vector<size_t> pads_begin, pads_end;
500 std::vector<size_t> kernel_size;
501 std::vector<size_t> strides;
// Zero/false defaults everywhere; the pooling type defaults to MAX.
503 PoolingInvoker() : src(0), rois(0), dst(0), mask(0), pad_l(0), pad_t(0), pad_r(0), pad_b(0),
504 avePoolPaddedArea(false), nstripes(0),
505 computeMaxIdx(0), poolingType(MAX), spatialScale(0) {}
// Validates the blobs, fills a PoolingInvoker and runs it over nstripes parallel stripes.
// NOTE(review): the four size vectors are taken by value, so each call copies them.
507 static void run(const Mat& src, const Mat& rois, Mat& dst, Mat& mask,
508 std::vector<size_t> kernel_size, std::vector<size_t> strides,
509 std::vector<size_t> pads_begin, std::vector<size_t> pads_end,
510 bool avePoolPaddedArea, int poolingType, float spatialScale,
511 bool computeMaxIdx, int nstripes)
// Preconditions (the opening of the assertion macro is not visible): continuous CV_32F
// blobs of 4 or 5 dimensions; the batch size matches, or for RoI types the output batch
// equals the number of RoIs; channel counts match except for PSROI; a non-empty mask has
// the type of src and the size of dst.
514 src.isContinuous(), dst.isContinuous(),
515 src.type() == CV_32F, src.type() == dst.type(),
516 src.dims == 4 || src.dims == 5, dst.dims == 4 || dst.dims == 5,
517 (((poolingType == ROI || poolingType == PSROI) &&
518 dst.size[0] == rois.size[0]) || src.size[0] == dst.size[0]),
519 poolingType == PSROI || src.size[1] == dst.size[1],
520 (mask.empty() || (mask.type() == src.type() && mask.size == dst.size)));
528 p.kernel_size = kernel_size;
530 p.pads_begin = pads_begin;
531 p.pads_end = pads_end;
534 p.kernel = Size(kernel_size[1], kernel_size[0]);
535 p.stride = Size(strides[1], strides[0]);
// Scalar paddings come from the last two entries of the padding vectors, so those
// vectors need at least two elements here.
536 p.pad_l = pads_begin.back();
537 p.pad_t = pads_begin[pads_begin.size() - 2];
538 p.pad_r = pads_end.back();
539 p.pad_b = pads_end[pads_end.size() - 2];
541 p.avePoolPaddedArea = avePoolPaddedArea;
542 p.nstripes = nstripes;
543 p.computeMaxIdx = computeMaxIdx;
544 p.poolingType = poolingType;
545 p.spatialScale = spatialScale;
// Precompute, for every (depth, row, column) kernel tap, its element offset from the
// window origin inside one source channel volume.
549 int height = src.size[src.dims - 2];
550 int width = src.size[src.dims - 1];
552 int kernel_d = (kernel_size.size() == 3) ? kernel_size[0] : 1;
553 int kernel_h = kernel_size[kernel_size.size() - 2];
554 int kernel_w = kernel_size.back();
556 p.ofsbuf.resize(kernel_d * kernel_h * kernel_w);
557 for (int i = 0; i < kernel_d; ++i) {
558 for (int j = 0; j < kernel_h; ++j) {
559 for (int k = 0; k < kernel_w; ++k) {
560 p.ofsbuf[i * kernel_h * kernel_w + j * kernel_w + k] = width * height * i + width * j + k;
566 parallel_for_(Range(0, nstripes), p, nstripes);
// Processes the stripes in range r: every output element of the stripe is computed with
// the routine that matches poolingType (MAX, AVE, ROI, and a final position-sensitive branch).
569 void operator()(const Range& r) const CV_OVERRIDE
571 int channels = dst->size[1];
// A 4-D source means 2-D pooling; otherwise axis 2 is treated as depth.
573 bool isPool2D = src->dims == 4;
574 int depth = !isPool2D? dst->size[2] : 1;
575 int height = dst->size[dst->dims - 2];
576 int width = dst->size[dst->dims - 1];
578 int inp_depth = !isPool2D? src->size[2] : 1;
579 int inp_height = src->size[src->dims - 2];
580 int inp_width = src->size[src->dims - 1];
// The flattened output is cut into nstripes ranges of stripeSize elements (rounded up);
// this call handles [r.start * stripeSize, min(r.end * stripeSize, total)).
582 size_t total = dst->total();
583 size_t stripeSize = (total + nstripes - 1)/nstripes;
584 size_t stripeStart = r.start*stripeSize;
585 size_t stripeEnd = std::min(r.end*stripeSize, total);
587 int kernel_d = !isPool2D? kernel_size[0] : 1;
588 int kernel_h = kernel_size[kernel_size.size() - 2];
589 int kernel_w = kernel_size.back();
// The depth stride is 0 for 2-D pooling.
591 int stride_d = !isPool2D? strides[0] : 0;
592 int stride_h = strides[strides.size() - 2];
593 int stride_w = strides.back();
594 bool compMaxIdx = computeMaxIdx;
// Max pooling without an index mask relies on the precomputed tap offsets.
597 const int* ofsptr = ofsbuf.empty() ? 0 : (const int*)&ofsbuf[0];
598 if (poolingType == MAX && !compMaxIdx && !ofsptr)
599 CV_Error(Error::StsBadArg, "ofsbuf should be initialized in this mode");
// Vector constants used by the index-tracking path: per-lane start offsets in multiples
// of stride_w, a unit step, and inp_width - kernel_w.
600 v_float32x4 idx00(0.f, (float)stride_w, (float)(stride_w*2), (float)(stride_w*3));
601 v_float32x4 ones = v_setall_f32(1.f);
602 v_float32x4 idx_delta = v_setall_f32((float)(inp_width - kernel_w));
// ofs0 is advanced inside the loop body (that statement is not visible in this listing).
605 for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
// Decompose the linear output offset into x, y, depth, channel and batch indices
// (the intermediate divisions are not visible in this listing).
608 int x0 = (int)(ofs % width);
610 int y0 = (int)(ofs % height);
613 int d0 = (int)(ofs % depth);
616 int c = (int)(ofs % channels);
617 int n = (int)(ofs / channels);
619 int dstart = 0, dend = 1;
621 const float *srcData = 0;
// ROI: row n of rois describes the region. Elements 2 and 4 are scaled by spatialScale and
// then rounded to give the vertical bounds; element 0 selects the source batch item.
622 if (poolingType == ROI)
624 const float *roisData = rois->ptr<float>(n);
625 int ystartROI = roundRoiSize(roisData[2] * spatialScale);
626 int yendROI = roundRoiSize(roisData[4] * spatialScale);
627 int roiHeight = std::max(yendROI - ystartROI + 1, 1);
628 float roiRatio = (float)roiHeight / height;
630 ystart = ystartROI + y0 * roiRatio;
631 yend = ystartROI + std::ceil((y0 + 1) * roiRatio);
633 CV_Assert(roisData[0] < src->size[0]);
634 srcData = src->ptr<float>(roisData[0], c);
// PSROI: the coordinates are rounded first and scaled afterwards; the region height is at least 0.1.
636 else if (poolingType == PSROI)
638 const float *roisData = rois->ptr<float>(n);
639 float ystartROI = roundRoiSize(roisData[2]) * spatialScale;
640 float yendROI = roundRoiSize(roisData[4] + 1) * spatialScale;
641 float roiHeight = std::max(yendROI - ystartROI, 0.1f);
642 float roiRatio = roiHeight / height;
644 ystart = (int)std::floor(ystartROI + y0 * roiRatio);
645 yend = (int)std::ceil(ystartROI + (y0 + 1) * roiRatio);
// Regular pooling: window bounds in depth and height, still including the padding.
649 int pad_d_begin = (pads_begin.size() == 3) ? pads_begin[0] : 0;
650 dstart = d0 * stride_d - pad_d_begin;
651 dend = min(dstart + kernel_d, (int)(inp_depth + pads_end[0]));
653 ystart = y0 * stride_h - pad_t;
654 yend = min(ystart + kernel_h, inp_height + pad_b);
655 srcData = src->ptr<float>(n, c);
// ddelta and ydelta keep the unclipped extents (used by the padded-area average);
// the bounds themselves are then clipped to the input.
657 int ddelta = dend - dstart;
658 dstart = max(dstart, 0);
659 dend = min(dend, inp_depth);
660 int ydelta = yend - ystart;
661 ystart = max(ystart, 0);
662 yend = min(yend, inp_height);
// Pointers to the start of the current output row; the mask pointer is null without mask data.
663 float *dstData = &dst->ptr<float>(n, c, d0)[y0 * width];
664 float *dstMaskData = mask->data ? &mask->ptr<float>(n, c, d0)[y0 * width] : 0;
// Number of outputs handled in this row segment: to the end of the row or of the stripe.
666 int delta = std::min((int)(stripeEnd - ofs0), width - x0);
// ---- Max pooling ----
670 if( poolingType == MAX)
671 for( ; x0 < x1; x0++ )
673 int xstart = x0 * stride_w - pad_l;
674 int xend = min(xstart + kernel_w, inp_width);
675 xstart = max(xstart, 0);
// Empty window: the mask entry, when requested, is set to -1.
676 if (xstart >= xend || ystart >= yend)
679 if (compMaxIdx && dstMaskData)
680 dstMaskData[x0] = -1;
// Vector path: 2-D pooling, window not touching the left border, at least 8 outputs left
// in the segment, and the 8th window ending before the right border. Eight outputs are
// produced at once with two 4-lane vectors.
684 if( isPool2D && xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_l + kernel_w < inp_width )
// Variant that also tracks the index of the maximum (indices are kept as floats).
688 v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
689 v_float32x4 max_val1 = max_val0;
690 v_float32x4 max_idx0 = v_setall_f32(-1.f);
691 v_float32x4 max_idx1 = max_idx0;
692 int index0 = ystart * inp_width + xstart;
693 v_float32x4 idx0 = idx00 + v_setall_f32((float)index0);
694 v_float32x4 idx1 = idx0 + v_setall_f32((float)(stride_w*4));
696 for (int y = ystart; y < yend; ++y)
698 for (int x = xstart; x < xend; ++x, idx0 += ones, idx1 += ones)
700 const int index = y * inp_width + x;
701 v_float32x4 v0(srcData[index], srcData[index + stride_w],
702 srcData[index + stride_w*2], srcData[index + stride_w*3]);
703 v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
704 srcData[index + stride_w*6], srcData[index + stride_w*7]);
// The index is updated only on a strictly greater value, so the first maximum wins.
705 max_idx0 = v_select(v0 > max_val0, idx0, max_idx0);
706 max_idx1 = v_select(v1 > max_val1, idx1, max_idx1);
707 max_val0 = v_max(max_val0, v0);
708 max_val1 = v_max(max_val1, v1);
713 v_store(dstData + x0, max_val0);
714 v_store(dstData + x0 + 4, max_val1);
717 v_store(dstMaskData + x0, max_idx0);
718 v_store(dstMaskData + x0 + 4, max_idx1);
// Variant without index tracking.
724 v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
725 v_float32x4 max_val1 = max_val0;
// Full-height window: walk the precomputed tap offsets.
727 if( yend - ystart == kernel_h )
729 const float* srcData1 = srcData + ystart*inp_width + xstart;
// Contiguous loads; presumably the stride_w == 1 case (the condition is not visible) - confirm.
731 for (int k = 0; k < kernel_w*kernel_h; k++)
733 int index = ofsptr[k];
734 v_float32x4 v0 = v_load(srcData1 + index);
735 v_float32x4 v1 = v_load(srcData1 + index + 4);
736 max_val0 = v_max(max_val0, v0);
737 max_val1 = v_max(max_val1, v1);
// Stride 2: deinterleaving loads keep every other element.
739 else if( stride_w == 2 )
740 for (int k = 0; k < kernel_w*kernel_h; k++)
742 int index = ofsptr[k];
743 v_float32x4 v0, v1, dummy;
744 v_load_deinterleave(srcData1 + index, v0, dummy); // f0 f2 f4 f6 ,f1 f3 f5 f7
745 v_load_deinterleave(srcData1 + index + 8, v1, dummy); // f8 f10 f12 f14 ,f9 f11 f13 f15
746 max_val0 = v_max(max_val0, v0);
747 max_val1 = v_max(max_val1, v1);
// Remaining strides: gather the eight lanes element by element.
750 for (int k = 0; k < kernel_w*kernel_h; k++)
752 int index = ofsptr[k];
753 v_float32x4 v0(srcData1[index], srcData1[index + stride_w],
754 srcData1[index + stride_w*2], srcData1[index + stride_w*3]);
755 v_float32x4 v1(srcData1[index + stride_w*4], srcData1[index + stride_w*5],
756 srcData1[index + stride_w*6], srcData1[index + stride_w*7]);
757 max_val0 = v_max(max_val0, v0);
758 max_val1 = v_max(max_val1, v1);
// Vertically clipped window: iterate over the clipped rows explicitly.
763 for (int y = ystart; y < yend; ++y)
765 for (int x = xstart; x < xend; ++x)
767 const int index = y * inp_width + x;
768 v_float32x4 v0(srcData[index], srcData[index + stride_w],
769 srcData[index + stride_w*2], srcData[index + stride_w*3]);
770 v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
771 srcData[index + stride_w*6], srcData[index + stride_w*7]);
772 max_val0 = v_max(max_val0, v0);
773 max_val1 = v_max(max_val1, v1);
777 v_store(dstData + x0, max_val0);
778 v_store(dstData + x0 + 4, max_val1);
// Scalar fallback, one output at a time, over depth, rows and columns.
785 float max_val = -FLT_MAX;
// With index tracking (the comparison that updates max_index is not visible in this listing).
789 for (int d = dstart; d < dend; ++d)
790 for (int y = ystart; y < yend; ++y)
791 for (int x = xstart; x < xend; ++x)
793 const int index = d * inp_width * inp_height + y * inp_width + x;
794 float val = srcData[index];
801 dstData[x0] = max_val;
803 dstMaskData[x0] = max_index;
// Without index tracking.
807 for (int d = dstart; d < dend; ++d) {
808 for (int y = ystart; y < yend; ++y) {
809 for (int x = xstart; x < xend; ++x) {
810 const int index = d * inp_width * inp_height + y * inp_width + x;
811 float val = srcData[index];
812 max_val = std::max(max_val, val);
816 dstData[x0] = max_val;
// ---- Average pooling ----
820 else if (poolingType == AVE)
822 for( ; x0 < x1; ++x0)
824 int xstart = x0 * stride_w - pad_l;
825 int xend = min(xstart + kernel_w, inp_width + pad_r);
826 int xdelta = xend - xstart;
827 xstart = max(xstart, 0);
828 xend = min(xend, inp_width);
// The divisor is the padded window volume when avePoolPaddedArea is set, otherwise the
// clipped one. NOTE(review): the clipped volume looks like it can be zero when the window
// lies entirely in the padding - confirm this cannot happen.
829 float inv_kernel_area = avePoolPaddedArea ? xdelta * ydelta * ddelta :
830 ((dend - dstart) * (yend - ystart) * (xend - xstart));
831 inv_kernel_area = 1.0 / inv_kernel_area;
// Same eligibility test as the max pooling vector path.
833 if( isPool2D && xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_l + kernel_w < inp_width )
835 v_float32x4 sum_val0 = v_setzero_f32(), sum_val1 = v_setzero_f32();
836 v_float32x4 ikarea = v_setall_f32(inv_kernel_area);
838 for (int y = ystart; y < yend; ++y)
840 for (int x = xstart; x < xend; ++x)
842 const int index = y * inp_width + x;
843 v_float32x4 v0(srcData[index], srcData[index + stride_w],
844 srcData[index + stride_w*2], srcData[index + stride_w*3]);
845 v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
846 srcData[index + stride_w*6], srcData[index + stride_w*7]);
851 v_store(dstData + x0, sum_val0*ikarea);
852 v_store(dstData + x0 + 4, sum_val1*ikarea);
// Scalar fallback (the accumulation statement is not visible in this listing).
859 for (int d = dstart; d < dend; ++d) {
860 for (int y = ystart; y < yend; ++y) {
861 for (int x = xstart; x < xend; ++x) {
862 const int index = d * inp_width * inp_height + y * inp_width + x;
863 float val = srcData[index];
868 dstData[x0] = sum_val*inv_kernel_area;
// ---- RoI max pooling ----
872 else if (poolingType == ROI)
// Horizontal bounds come from elements 1 and 3 of the RoI row, scaled and then rounded.
874 const float *roisData = rois->ptr<float>(n);
875 int xstartROI = roundRoiSize(roisData[1] * spatialScale);
876 int xendROI = roundRoiSize(roisData[3] * spatialScale);
877 int roiWidth = std::max(xendROI - xstartROI + 1, 1);
878 float roiRatio = (float)roiWidth / width;
879 for( ; x0 < x1; x0++ )
881 int xstart = xstartROI + x0 * roiRatio;
882 int xend = xstartROI + std::ceil((x0 + 1) * roiRatio);
883 xstart = max(xstart, 0);
884 xend = min(xend, inp_width);
// Empty bin: the mask entry, when requested, is set to -1.
885 if (xstart >= xend || ystart >= yend)
888 if (compMaxIdx && dstMaskData)
889 dstMaskData[x0] = -1;
892 float max_val = -FLT_MAX;
893 for (int y = ystart; y < yend; ++y)
894 for (int x = xstart; x < xend; ++x)
896 const int index = y * inp_width + x;
897 float val = srcData[index];
898 max_val = std::max(max_val, val);
900 dstData[x0] = max_val;
// ---- Final branch: position-sensitive RoI average (presumably the PSROI case) ----
// Horizontal bounds are rounded first and scaled afterwards; the width is at least 0.1.
905 const float *roisData = rois->ptr<float>(n);
906 CV_Assert(roisData[0] < src->size[0]);
907 float xstartROI = roundRoiSize(roisData[1]) * spatialScale;
908 float xendROI = roundRoiSize(roisData[3] + 1) * spatialScale;
909 float roiWidth = std::max(xendROI - xstartROI, 0.1f);
910 float roiRatio = roiWidth / width;
911 for( ; x0 < x1; x0++ )
913 int xstart = (int)std::floor(xstartROI + x0 * roiRatio);
914 int xend = (int)std::ceil(xstartROI + (x0 + 1) * roiRatio);
915 xstart = max(xstart, 0);
916 xend = min(xend, inp_width);
917 if (xstart >= xend || ystart >= yend)
// Each output position (c, y0, x0) reads its own source channel, (c * height + y0) * width + x0.
923 srcData = src->ptr<float>(roisData[0], (c * height + y0) * width + x0);
925 for (int y = ystart; y < yend; ++y)
926 for (int x = xstart; x < xend; ++x)
928 const int index = y * inp_width + x;
929 float val = srcData[index];
// Average over the clipped bin area.
932 dstData[x0] = sum_val / ((yend - ystart) * (xend - xstart));
// Runs PoolingInvoker for max pooling with one stripe per available thread.
// rois is a local whose declaration is not visible in this listing.
939 void maxPooling(Mat &src, Mat &dst, Mat &mask)
941 const int nstripes = getNumThreads();
943 PoolingInvoker::run(src, rois, dst, mask, kernel_size, strides, pads_begin, pads_end, avePoolPaddedArea, type, spatialScale, computeMaxIdx, nstripes);
// Runs PoolingInvoker for average pooling with one stripe per available thread.
// rois and mask are locals whose declarations are not visible in this listing.
946 void avePooling(Mat &src, Mat &dst)
948 const int nstripes = getNumThreads();
950 PoolingInvoker::run(src, rois, dst, mask, kernel_size, strides, pads_begin, pads_end, avePoolPaddedArea, type, spatialScale, computeMaxIdx, nstripes);
// Runs PoolingInvoker for the RoI pooling types with one stripe per available thread.
// mask is a local whose declaration is not visible in this listing.
953 void roiPooling(const Mat &src, const Mat &rois, Mat &dst)
955 const int nstripes = getNumThreads();
// The parameter vectors are resized to two entries because run() indexes their last two
// elements. NOTE(review): this modifies the layer members themselves.
957 kernel_size.resize(2);
959 pads_begin.resize(2);
961 PoolingInvoker::run(src, rois, dst, mask, kernel_size, strides, pads_begin, pads_end, avePoolPaddedArea, type, spatialScale, computeMaxIdx, nstripes);
// Builds the Halide function for max pooling. The function yields a tuple: the maximum
// value and the flattened index of that maximum cast to float.
964 virtual Ptr<BackendNode> initMaxPoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
967 Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
968 const int inWidth = inputBuffer.width();
969 const int inHeight = inputBuffer.height();
971 Halide::Var x("x"), y("y"), c("c"), n("n");
972 Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
// Reduction domain over the kernel window.
973 Halide::RDom r(0, kernel.width, 0, kernel.height);
// Source coordinates: with padding they are clamped into the image on both sides,
// otherwise only the upper bound is limited (the selecting condition is not visible).
977 kx = clamp(x * stride.width + r.x - pad_l, 0, inWidth - 1);
978 ky = clamp(y * stride.height + r.y - pad_t, 0, inHeight - 1);
982 kx = min(x * stride.width + r.x, inWidth - 1);
983 ky = min(y * stride.height + r.y, inHeight - 1);
986 // Halide::argmax returns tuple (r.x, r.y, max).
987 Halide::Tuple res = argmax(inputBuffer(kx, ky, c, n));
989 // Compute offset from argmax in range [0, kernel_size).
990 Halide::Expr max_index;
// Convert the in-window argmax back into a row-major index into the input plane,
// using the same clamping as above.
993 max_index = clamp(y * stride.height + res[1] - pad_t,
994 0, inHeight - 1) * inWidth +
995 clamp(x * stride.width + res[0] - pad_l,
1000 max_index = min(y * stride.height + res[1], inHeight - 1) * inWidth +
1001 min(x * stride.width + res[0], inWidth - 1);
1003 top(x, y, c, n) = { res[2], Halide::cast<float>(max_index) };
1004 return Ptr<BackendNode>(new HalideBackendNode(top));
1005 #endif // HAVE_HALIDE
// Reached when Halide support is compiled out.
1006 return Ptr<BackendNode>();
// Builds the Halide function for average pooling over full kernel windows only.
1009 virtual Ptr<BackendNode> initAvePoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
1012 Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
// Windows must tile the input exactly; partial windows are rejected.
1014 const int inW = inputBuffer.width(), inH = inputBuffer.height();
1015 if ((inW - kernel.width) % stride.width || (inH - kernel.height) % stride.height)
1017 CV_Error(cv::Error::StsNotImplemented,
1018 "Halide backend for average pooling with partial "
1019 "kernels is not implemented");
// Every window is normalised by the full kernel area.
1022 const float norm = 1.0f / (kernel.width * kernel.height);
1024 Halide::Var x("x"), y("y"), c("c"), n("n");
1025 Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
1026 Halide::RDom r(0, kernel.width, 0, kernel.height);
// No padding offset is applied to the source coordinates.
1027 top(x, y, c, n) = sum(
1028 inputBuffer(x * stride.width + r.x,
1029 y * stride.height + r.y, c, n)) * norm;
1030 return Ptr<BackendNode>(new HalideBackendNode(top));
1031 #endif // HAVE_HALIDE
// Reached when Halide support is compiled out.
1032 return Ptr<BackendNode>();
// Applies a manual Halide schedule to the pooling function. Only the split and fuse
// calls are visible here; the rest of each schedule chain is not shown in this listing.
1035 virtual void applyHalideScheduler(Ptr<BackendNode>& node,
1036 const std::vector<Mat*> &inputs,
1037 const std::vector<Mat> &outputs,
1038 int targetId) const CV_OVERRIDE
// Targets other than the CPU use the base class scheduler.
1041 if (targetId != DNN_TARGET_CPU)
1043 Layer::applyHalideScheduler(node, inputs, outputs, targetId);
1046 Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"),
1047 xi("xi"), yi("yi"), ci("ci"), xo("xo"), yo("yo"), co("co");
// The schedule is applied to the last function stored in the node.
1048 Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
1050 int outW, outH, outC, outN;
1051 getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
// Small outputs (either spatial dimension below 8): no spatial tiling; channels are split
// by 8 in one variant (the condition choosing between the variants is not visible).
1053 if (outW < 8 || outH < 8)
1056 top.split(c, co, ci, 8)
1057 .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
1062 top.fuse(y, c, tile).fuse(n, tile, tile)
// Larger outputs: 8x8 spatial tiles, with or without an extra channel split by 8.
1071 top.split(x, xo, xi, 8).split(y, yo, yi, 8).split(c, co, ci, 8)
1072 .fuse(xo, yo, tile).fuse(co, tile, tile).fuse(n, tile, tile)
1076 top.split(x, xo, xi, 8).split(y, yo, yi, 8)
1077 .fuse(xo, yo, tile).fuse(c, tile, tile).fuse(n, tile, tile)
1081 #endif // HAVE_HALIDE
// Computes the output shape(s) from the first input shape and the layer parameters.
1084 bool getMemoryShapes(const std::vector<MatShape> &inputs,
1085 const int requiredOutputs,
1086 std::vector<MatShape> &outputs,
1087 std::vector<MatShape> &internals) const CV_OVERRIDE
1089 CV_Assert(inputs.size() != 0);
// inpShape holds the axes from index 2 onwards; outShape starts with the first two axes.
1091 std::vector<int> inpShape(inputs[0].begin() + 2, inputs[0].end());
1092 std::vector<int> outShape(inputs[0].begin(), inputs[0].begin() + 2);
// Output of size 1x1; presumably the global pooling case (the condition is not visible).
1096 outShape.push_back(1);
1097 outShape.push_back(1);
// RoI variants: the output has the configured pooled size.
1099 else if (type == ROI || type == PSROI)
1101 outShape.push_back(pooledSize.height);
1102 outShape.push_back(pooledSize.width);
// Explicit padding: out = 1 + round((in + pad_begin + pad_end - kernel) / stride),
// rounding up or down according to ceilMode.
1104 else if (padMode.empty())
1106 for (int i = 0; i < kernel_size.size(); i++) {
1107 float dst = (float)(inpShape[i] + pads_begin[i] + pads_end[i] - kernel_size[i]) / strides[i];
1108 outShape.push_back(1 + (ceilMode ? ceil(dst) : floor(dst)));
1111 // If we have padding, ensure that the last pooling starts strictly
1112 // inside the image (instead of at the padding); otherwise clip the last.
1113 for (int i = 0; i < pads_end.size(); i++) {
1114 if (pads_end[i] && (outShape[2 + i] - 1) * strides[i] >= inpShape[i] + pads_end[i]) {
1116 CV_Assert((outShape[2 + i] - 1) * strides[i] < inpShape[i] + pads_end[i]);
// Named padding mode: the shared helper computes the sizes; every dilation is passed as 1.
1122 getConvPoolOutParams(inpShape, kernel_size, strides, padMode, std::vector<size_t>(kernel_size.size(), 1), outShape);
// Two-input case (presumably type == ROI; the condition is not visible): the batch
// dimension becomes the first dimension of the second input.
1126 CV_Assert(inputs.size() == 2);
1127 outShape[0] = inputs[1][0]; // Number of proposals;
// PSROI additionally requires channels == output_dim * pooled_w * pooled_h and sets the
// output channel count to psRoiOutChannels.
1129 else if (type == PSROI)
1131 CV_Assert(inputs.size() == 2);
1132 CV_Assert(psRoiOutChannels * pooledSize.width * pooledSize.height == inputs[0][1]);
1133 outShape[0] = inputs[1][0]; // Number of proposals;
1134 outShape[1] = psRoiOutChannels;
// Without an explicit request, max pooling yields two outputs and the other types one.
// Two outputs are only valid for max pooling; every output gets the same shape.
1136 int numOutputs = requiredOutputs ? requiredOutputs : (type == MAX ? 2 : 1);
1137 CV_Assert(numOutputs == 1 || (numOutputs == 2 && type == MAX));
1139 outputs.assign(numOutputs, outShape);
// Estimates the operation count from the output sizes and the kernel volume.
1144 virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
1145 const std::vector<MatShape> &outputs) const CV_OVERRIDE
1147 CV_UNUSED(inputs); // suppress unused variable warning
// Kernel volume: product of all kernel_size entries.
// NOTE(review): the initial value 1 is an int, so std::accumulate carries the running
// product as int even though the functor is multiplies<size_t> - confirm this is intended.
1149 size_t karea = std::accumulate(kernel_size.begin(), kernel_size.end(),
1150 1, std::multiplies<size_t>());
1151 for(int i = 0; i < outputs.size(); i++)
// Two cost formulas are visible: karea per output element, and karea + 1 per output
// element. The conditions selecting between them are not visible in this listing.
1156 flops += total(outputs[i])*karea;
1160 flops += total(outputs[i])*(karea + 1);
// Pooling types that take a second input blob holding the regions of interest.
1171 ROI, // RoI pooling, https://arxiv.org/pdf/1504.08083.pdf
1172 PSROI // Position-sensitive RoI pooling, https://arxiv.org/pdf/1605.06409.pdf
// Factory: wraps a new PoolingLayerImpl built from params in a Ptr<PoolingLayer>.
1176 Ptr<PoolingLayer> PoolingLayer::create(const LayerParams& params)
1178 return Ptr<PoolingLayer>(new PoolingLayerImpl(params));