1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Copyright (C) 2017, Intel Corporation, all rights reserved.
15 // Third party copyrights are property of their respective owners.
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
43 #include "../precomp.hpp"
44 #include "layers_common.hpp"
45 #include "opencv2/core/hal/intrin.hpp"
46 #include "op_halide.hpp"
47 #include "opencl_kernels_dnn.hpp"
53 using namespace cv::dnn::ocl4dnn;
// Pooling layer implementation: CPU reference path plus OpenCL (ocl4dnn)
// and Halide backends. NOTE(review): this view of the file elides lines
// (braces, some conditions) — code lines below are kept byte-identical.
61 class PoolingLayerImpl : public PoolingLayer
// Constructor: parses pooling parameters from the layer definition.
// Defaults to MAX pooling when no "pool" parameter is present.
64 PoolingLayerImpl(const LayerParams& params)
66 type = PoolingLayer::MAX;
69 if (params.has("pool"))
// "pool" is matched case-insensitively; the "max" comparison branch
// sits on an elided line just before the assignment below.
71 String pool = params.get<String>("pool").toLowerCase();
73 type = PoolingLayer::MAX;
74 else if (pool == "ave")
75 type = PoolingLayer::AVE;
76 else if (pool == "stochastic")
77 type = PoolingLayer::STOCHASTIC;
// Any unrecognized pooling type is a hard error.
79 CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\"");
// Kernel/stride/pad geometry and padding mode come from shared helper
// logic used by conv and pooling layers alike.
82 getPoolingKernelParams(params, kernel.height, kernel.width, globalPooling,
83 pad.height, pad.width, stride.height, stride.width, padMode);
84 setParamsFrom(params);
// Caffe-compatible default: use ceil() when computing output size.
85 ceilMode = params.get<bool>("ceil_mode", true);
// Lazily-created OpenCL pooling primitive (see forward_ocl).
89 Ptr<OCL4DNNPool<float> > poolOp;
// Called once shapes are known: recomputes the actual paddings from the
// input/output spatial sizes and the padding mode. Exactly one input blob
// is supported (NCHW layout: size[2]=height, size[3]=width).
92 void finalize(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
94 CV_Assert(inputs.size() == 1);
96 cv::Size inp(inputs[0]->size[3], inputs[0]->size[2]),
97 out(outputs[0].size[3], outputs[0].size[2]);
// Size(1, 1) is the dilation (pooling is always undilated here);
// `pad` is updated in place.
104 getConvPoolPaddings(inp, out, kernel, stride, padMode, Size(1, 1), pad);
// Reports which backends can run this layer. Halide is supported only for
// MAX pooling, or AVE pooling without padding. Note && binds tighter than
// ||, so the Halide conditions group together without extra parentheses.
107 virtual bool supportBackend(int backendId)
109 return backendId == DNN_BACKEND_DEFAULT ||
110 backendId == DNN_BACKEND_HALIDE && haveHalide() &&
111 (type == PoolingLayer::MAX ||
112 type == PoolingLayer::AVE && !pad.width && !pad.height);
// OpenCL forward pass via the ocl4dnn pooling primitive. Returns false to
// fall back to the CPU path when the primitive cannot run (elided lines
// presumably guard poolOp creation and the failure return — TODO confirm).
116 bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, InputArrayOfArrays internals)
118 std::vector<UMat> inputs;
119 std::vector<UMat> outputs;
121 inps.getUMatVector(inputs);
122 outs.getUMatVector(outputs);
// Build the pooling configuration from the current shapes/params.
126 OCL4DNNPoolConfig config;
128 config.in_shape = shape(inputs[0]);
129 config.out_shape = shape(outputs[0]);
130 config.kernel = kernel;
132 config.stride = stride;
133 config.channels = inputs[0].size[1];
134 config.pool_method = type == MAX ? LIBDNN_POOLING_METHOD_MAX :
135 (type == AVE ? LIBDNN_POOLING_METHOD_AVE :
136 LIBDNN_POOLING_METHOD_STO);
137 poolOp = Ptr<OCL4DNNPool<float> >(new OCL4DNNPool<float>(config));
140 for (size_t ii = 0; ii < inputs.size(); ii++)
142 UMat& inpMat = inputs[ii];
// MAX pooling produces two outputs per input (values + argmax mask),
// so outputs are interleaved [out0, mask0, out1, mask1, ...].
143 int out_index = (type == MAX) ? 2 : 1;
144 UMat& outMat = outputs[out_index * ii];
145 UMat maskMat = (type == MAX) ? outputs[2 * ii + 1] : UMat();
// The OCL kernels assume zero-offset (non-ROI) buffers.
147 CV_Assert(inpMat.offset == 0 && outMat.offset == 0);
149 if (!poolOp->Forward(inpMat, outMat, maskMat))
// Generic forward entry point: tries the OpenCL path first (only on Intel
// OpenCL targets, per the performance check), then falls back to the
// Mat-based CPU implementation.
157 void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
160 CV_TRACE_ARG_VALUE(name, "name", name.c_str());
// CV_OCL_RUN returns early from this function if forward_ocl succeeds.
162 CV_OCL_RUN((preferableTarget == DNN_TARGET_OPENCL) &&
163 OCL_PERFORMANCE_CHECK(ocl::Device::getDefault().isIntel()),
164 forward_ocl(inputs_arr, outputs_arr, internals_arr))
166 Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
// CPU forward pass: dispatches per input blob on the pooling type
// (the switch/if on `type` sits on elided lines). MAX writes a value
// output and an argmax mask output per input; AVE writes one output.
// STOCHASTIC (and anything else) is not implemented on CPU.
169 void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
172 CV_TRACE_ARG_VALUE(name, "name", name.c_str());
174 for (size_t ii = 0; ii < inputs.size(); ii++)
179 maxPooling(*inputs[ii], outputs[2 * ii], outputs[2 * ii + 1]);
182 avePooling(*inputs[ii], outputs[ii]);
185 CV_Error(Error::StsNotImplemented, "Not implemented");
// Halide backend dispatch: MAX and AVE have dedicated builders; any other
// type yields an empty node (backend unsupported, see supportBackend).
191 virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
193 if (type == PoolingLayer::MAX)
194 return initMaxPoolingHalide(inputs);
195 else if (type == PoolingLayer::AVE)
196 return initAvePoolingHalide(inputs);
198 return Ptr<BackendNode>();
// Parallel body that computes pooling over horizontal stripes of the
// output. Members for src/dst/mask pointers, nstripes, computeMaxIdx and
// poolingType are declared on elided lines alongside the ones visible here.
201 class PoolingInvoker : public ParallelLoopBody
206 Size kernel, stride, pad;
// Precomputed flat offsets of each kernel element within a row-major
// input plane; filled in run() for the fast full-kernel SIMD path.
209 std::vector<int> ofsbuf;
212 PoolingInvoker() : src(0), dst(0), mask(0), nstripes(0), computeMaxIdx(0), poolingType(PoolingLayer::MAX) {}
// Validates the blobs, fills an invoker and launches it over `nstripes`
// parallel stripes. Requires continuous CV_32F NCHW tensors with matching
// batch/channel dims; `mask`, when non-empty, must mirror dst exactly.
214 static void run(const Mat& src, Mat& dst, Mat& mask, Size kernel,
215 Size stride, Size pad, int poolingType,
216 bool computeMaxIdx, int nstripes)
218 CV_Assert(src.isContinuous() && dst.isContinuous() &&
219 src.type() == CV_32F && src.type() == dst.type() &&
220 src.dims == 4 && dst.dims == 4 &&
221 src.size[0] == dst.size[0] && src.size[1] == dst.size[1] &&
222 (mask.empty() || (mask.type() == src.type() && mask.size == dst.size)));
// Assignments of src/dst/mask/kernel/stride/pad into `p` occur on
// elided lines just above.
232 p.nstripes = nstripes;
233 p.computeMaxIdx = computeMaxIdx;
234 p.poolingType = poolingType;
// Precompute per-kernel-element offsets into the input plane
// (src.size[3] is the input row stride in elements).
238 p.ofsbuf.resize(kernel.width*kernel.height);
239 for( int i = 0; i < kernel.height; i++ )
240 for( int j = 0; j < kernel.width; j++ )
241 p.ofsbuf[i*kernel.width + j] = src.size[3]*i + j;
244 parallel_for_(Range(0, nstripes), p, nstripes);
// Core pooling kernel for one stripe of output elements. The flat output
// range [stripeStart, stripeEnd) is decoded into (n, c, y0, x0) NCHW
// coordinates; rows are processed with SIMD (v_float32x4, 8 outputs per
// iteration) where safe, with scalar tails elsewhere. Several lines
// (divisions when decoding `ofs`, scalar accumulation, else/brace lines)
// are elided in this view.
247 void operator()(const Range& r) const
249 int channels = dst->size[1], width = dst->size[3], height = dst->size[2];
250 int inp_width = src->size[3], inp_height = src->size[2];
251 size_t total = dst->total();
// Split the flat output index space evenly across stripes.
252 size_t stripeSize = (total + nstripes - 1)/nstripes;
253 size_t stripeStart = r.start*stripeSize;
254 size_t stripeEnd = std::min(r.end*stripeSize, total);
255 int kernel_w = kernel.width, kernel_h = kernel.height;
256 int pad_w = pad.width, pad_h = pad.height;
257 int stride_w = stride.width, stride_h = stride.height;
258 bool compMaxIdx = computeMaxIdx;
// SIMD helpers: idx00 holds the flat-index offsets of 4 consecutive
// output columns; idx_delta is the per-row index correction.
261 const int* ofsptr = &ofsbuf[0];
262 v_float32x4 idx00(0.f, (float)stride_w, (float)(stride_w*2), (float)(stride_w*3));
263 v_float32x4 ones = v_setall_f32(1.f);
264 v_float32x4 idx_delta = v_setall_f32((float)(inp_width - kernel_w));
267 for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
// Decode flat offset -> x0, y0, c, n (the `ofs /= ...` steps between
// these mod operations are on elided lines).
270 int x0 = (int)(ofs % width);
272 int y0 = (int)(ofs % height);
274 int c = (int)(ofs % channels);
275 int n = (int)(ofs / channels);
// Vertical kernel window, clipped to the (padded) input extent;
// ydelta keeps the unclipped height for AVE normalization.
276 int ystart = y0 * stride_h - pad_h;
277 int yend = min(ystart + kernel_h, inp_height + pad_h);
278 int ydelta = yend - ystart;
279 ystart = max(ystart, 0);
280 yend = min(yend, inp_height);
281 const float *srcData = src->ptr<float>(n, c);
282 float *dstData = dst->ptr<float>(n, c, y0);
283 float *dstMaskData = mask->data ? mask->ptr<float>(n, c, y0) : 0;
// Process at most the rest of this stripe, and never past row end.
285 int delta = std::min((int)(stripeEnd - ofs0), width - x0);
289 if( poolingType == PoolingLayer::MAX )
290 for( ; x0 < x1; x0++ )
292 int xstart = x0 * stride_w - pad_w;
293 int xend = min(xstart + kernel_w, inp_width);
294 xstart = max(xstart, 0);
// Fast path: 8 outputs at once, valid only when all 8 windows
// lie fully inside the input (no left/right clipping).
297 if( xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
// Variant with argmax tracking (compMaxIdx): idx0/idx1 walk
// the flat source indices in lock-step with the values.
301 v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
302 v_float32x4 max_val1 = max_val0;
303 v_float32x4 max_idx0 = v_setall_f32(-1.f);
304 v_float32x4 max_idx1 = max_idx0;
305 int index0 = ystart * inp_width + xstart;
306 v_float32x4 idx0 = idx00 + v_setall_f32((float)index0);
307 v_float32x4 idx1 = idx0 + v_setall_f32((float)(stride_w*4));
309 for (int y = ystart; y < yend; ++y)
// (idx0/idx1 are advanced by idx_delta per row on an
// elided line.)
311 for (int x = xstart; x < xend; ++x, idx0 += ones, idx1 += ones)
313 const int index = y * inp_width + x;
314 v_float32x4 v0(srcData[index], srcData[index + stride_w],
315 srcData[index + stride_w*2], srcData[index + stride_w*3]);
316 v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
317 srcData[index + stride_w*6], srcData[index + stride_w*7]);
// Track the winning index before updating the running max.
318 max_idx0 = v_select(v0 > max_val0, idx0, max_idx0);
319 max_idx1 = v_select(v1 > max_val1, idx1, max_idx1);
320 max_val0 = v_max(max_val0, v0);
321 max_val1 = v_max(max_val1, v1);
326 v_store(dstData + x0, max_val0);
327 v_store(dstData + x0 + 4, max_val1);
330 v_store(dstMaskData + x0, max_idx0);
331 v_store(dstMaskData + x0 + 4, max_idx1);
// Variant without argmax: values only.
337 v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
338 v_float32x4 max_val1 = max_val0;
// If the vertical window is unclipped, use the precomputed
// ofsbuf offsets to iterate the whole kernel in one flat loop.
340 if( yend - ystart == kernel_h )
342 const float* srcData1 = srcData + ystart*inp_width + xstart;
// stride_w == 1: windows of adjacent outputs overlap, so
// contiguous v_load covers 8 outputs directly.
344 for (int k = 0; k < kernel_w*kernel_h; k++)
346 int index = ofsptr[k];
347 v_float32x4 v0 = v_load(srcData1 + index);
348 v_float32x4 v1 = v_load(srcData1 + index + 4);
349 max_val0 = v_max(max_val0, v0);
350 max_val1 = v_max(max_val1, v1);
// stride_w == 2: gather even lanes via SSE shuffle.
// NOTE(review): _mm_shuffle_ps is x86-specific; presumably
// guarded by an SSE preprocessor check on elided lines.
353 else if( stride_w == 2 )
354 for (int k = 0; k < kernel_w*kernel_h; k++)
356 int index = ofsptr[k];
357 v_float32x4 v00 = v_load(srcData1 + index), v01 = v_load(srcData1 + index + 4);
358 v_float32x4 v0(_mm_shuffle_ps(v00.val, v01.val, _MM_SHUFFLE(2, 0, 2, 0)));
359 v_float32x4 v10 = v_load(srcData1 + index + 8), v11 = v_load(srcData1 + index + 12);
360 v_float32x4 v1(_mm_shuffle_ps(v10.val, v11.val, _MM_SHUFFLE(2, 0, 2, 0)));
361 max_val0 = v_max(max_val0, v0);
362 max_val1 = v_max(max_val1, v1);
// Generic stride: element-wise strided gathers.
366 for (int k = 0; k < kernel_w*kernel_h; k++)
368 int index = ofsptr[k];
369 v_float32x4 v0(srcData1[index], srcData1[index + stride_w],
370 srcData1[index + stride_w*2], srcData1[index + stride_w*3]);
371 v_float32x4 v1(srcData1[index + stride_w*4], srcData1[index + stride_w*5],
372 srcData1[index + stride_w*6], srcData1[index + stride_w*7]);
373 max_val0 = v_max(max_val0, v0);
374 max_val1 = v_max(max_val1, v1);
// Vertically clipped window: fall back to the 2-D loop.
379 for (int y = ystart; y < yend; ++y)
381 for (int x = xstart; x < xend; ++x)
383 const int index = y * inp_width + x;
384 v_float32x4 v0(srcData[index], srcData[index + stride_w],
385 srcData[index + stride_w*2], srcData[index + stride_w*3]);
386 v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
387 srcData[index + stride_w*6], srcData[index + stride_w*7]);
388 max_val0 = v_max(max_val0, v0);
389 max_val1 = v_max(max_val1, v1);
393 v_store(dstData + x0, max_val0);
394 v_store(dstData + x0 + 4, max_val1);
// Scalar tail for clipped windows; with argmax tracking first
// (max_index bookkeeping lives on elided lines)...
401 float max_val = -FLT_MAX;
405 for (int y = ystart; y < yend; ++y)
406 for (int x = xstart; x < xend; ++x)
408 const int index = y * inp_width + x;
409 float val = srcData[index];
417 dstData[x0] = max_val;
419 dstMaskData[x0] = max_index;
// ...then the plain scalar max without a mask.
423 for (int y = ystart; y < yend; ++y)
424 for (int x = xstart; x < xend; ++x)
426 const int index = y * inp_width + x;
427 float val = srcData[index];
428 max_val = std::max(max_val, val);
431 dstData[x0] = max_val;
// AVE pooling branch: normalize by the *unclipped* kernel area
// (ydelta*xdelta), matching Caffe's include-padding convention.
437 for( ; x0 < x1; x0++ )
439 int xstart = x0 * stride_w - pad_w;
440 int xend = min(xstart + kernel_w, inp_width + pad_w);
441 int xdelta = xend - xstart;
442 xstart = max(xstart, 0);
443 xend = min(xend, inp_width);
444 float inv_kernel_area = 1.f/(ydelta*xdelta);
// SIMD fast path, same 8-wide safety condition as MAX above.
447 if( xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
449 v_float32x4 sum_val0 = v_setzero_f32(), sum_val1 = v_setzero_f32();
450 v_float32x4 ikarea = v_setall_f32(inv_kernel_area);
452 for (int y = ystart; y < yend; ++y)
454 for (int x = xstart; x < xend; ++x)
456 const int index = y * inp_width + x;
457 v_float32x4 v0(srcData[index], srcData[index + stride_w],
458 srcData[index + stride_w*2], srcData[index + stride_w*3]);
459 v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
460 srcData[index + stride_w*6], srcData[index + stride_w*7]);
// (sum_val0/sum_val1 accumulation is on elided lines.)
465 v_store(dstData + x0, sum_val0*ikarea);
466 v_store(dstData + x0 + 4, sum_val1*ikarea);
// Scalar tail: accumulate sum_val (accumulation line elided),
// then scale by the inverse kernel area.
473 for (int y = ystart; y < yend; ++y)
474 for (int x = xstart; x < xend; ++x)
476 const int index = y * inp_width + x;
477 float val = srcData[index];
481 dstData[x0] = sum_val*inv_kernel_area;
// CPU MAX pooling for one blob: delegates to the parallel invoker,
// writing both the pooled values (dst) and the argmax indices (mask).
489 void maxPooling(Mat &src, Mat &dst, Mat &mask)
491 const int nstripes = getNumThreads();
492 PoolingInvoker::run(src, dst, mask, kernel, stride, pad, type, computeMaxIdx, nstripes);
// CPU AVE pooling for one blob. `mask` here is presumably an empty local
// Mat declared on an elided line (AVE produces no argmax) — TODO confirm.
495 void avePooling(Mat &src, Mat &dst)
497 const int nstripes = getNumThreads();
499 PoolingInvoker::run(src, dst, mask, kernel, stride, pad, type, computeMaxIdx, nstripes);
// Builds a Halide pipeline for MAX pooling. Produces a two-element tuple
// per output pixel: the max value and the flat (y*inWidth + x) index of
// where it was found, mirroring the CPU mask output. Compiled only when
// HAVE_HALIDE is defined; otherwise returns an empty node.
502 virtual Ptr<BackendNode> initMaxPoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
505 Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
506 const int inWidth = inputBuffer.width();
507 const int inHeight = inputBuffer.height();
509 Halide::Var x("x"), y("y"), c("c"), n("n");
510 Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
// Reduction domain spans the pooling kernel.
511 Halide::RDom r(0, kernel.width, 0, kernel.height);
// With padding, clamp sample coordinates into the valid image so
// out-of-bounds taps replicate the border; without padding, only the
// upper bound needs clamping.
513 if (pad.width || pad.height)
515 kx = clamp(x * stride.width + r.x - pad.width, 0, inWidth - 1);
516 ky = clamp(y * stride.height + r.y - pad.height, 0, inHeight - 1);
520 kx = min(x * stride.width + r.x, inWidth - 1);
521 ky = min(y * stride.height + r.y, inHeight - 1);
524 // Halide::argmax returns tuple (r.x, r.y, max).
525 Halide::Tuple res = argmax(inputBuffer(kx, ky, c, n));
527 // Compute offset from argmax in range [0, kernel_size).
528 Halide::Expr max_index;
529 if (pad.width || pad.height)
531 max_index = clamp(y * stride.height + res[1] - pad.height,
532 0, inHeight - 1) * inWidth +
533 clamp(x * stride.width + res[0] - pad.width,
// The no-padding variant of the index computation.
538 max_index = min(y * stride.height + res[1], inHeight - 1) * inWidth +
539 min(x * stride.width + res[0], inWidth - 1);
541 top(x, y, c, n) = { res[2], Halide::cast<float>(max_index) };
542 return Ptr<BackendNode>(new HalideBackendNode(top));
543 #endif // HAVE_HALIDE
544 return Ptr<BackendNode>();
// Builds a Halide pipeline for AVE pooling: sum over the kernel reduction
// domain scaled by 1/kernel_area. Only full (non-partial) kernels are
// supported — i.e. the stride must tile the input exactly — otherwise the
// per-window normalization would differ at the borders.
547 virtual Ptr<BackendNode> initAvePoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
550 Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
552 const int inW = inputBuffer.width(), inH = inputBuffer.height();
553 if ((inW - kernel.width) % stride.width || (inH - kernel.height) % stride.height)
555 CV_Error(cv::Error::StsNotImplemented,
556 "Halide backend for average pooling with partial "
557 "kernels is not implemented");
// Constant normalization factor — valid because kernels are full.
560 const float norm = 1.0f / (kernel.width * kernel.height);
562 Halide::Var x("x"), y("y"), c("c"), n("n");
563 Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
564 Halide::RDom r(0, kernel.width, 0, kernel.height);
565 top(x, y, c, n) = sum(
566 inputBuffer(x * stride.width + r.x,
567 y * stride.height + r.y, c, n)) * norm;
568 return Ptr<BackendNode>(new HalideBackendNode(top));
569 #endif // HAVE_HALIDE
570 return Ptr<BackendNode>();
// Custom CPU schedule for the Halide pooling pipeline: chooses a tiling /
// fusion strategy based on output spatial size (small outputs fuse more
// dimensions; larger ones split x/y/c into 8-wide tiles). Non-CPU targets
// defer to the generic base-class scheduler.
573 virtual void applyHalideScheduler(Ptr<BackendNode>& node,
574 const std::vector<Mat*> &inputs,
575 const std::vector<Mat> &outputs,
579 if (targetId != DNN_TARGET_CPU)
581 Layer::applyHalideScheduler(node, inputs, outputs, targetId);
584 Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"),
585 xi("xi"), yi("yi"), ci("ci"), xo("xo"), yo("yo"), co("co");
586 Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
588 int outW, outH, outC, outN;
589 getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
// Small spatial output: tile over channels only (branch on an elided
// condition distinguishes the two fusions below).
591 if (outW < 8 || outH < 8)
594 top.split(c, co, ci, 8)
595 .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
600 top.fuse(y, c, tile).fuse(n, tile, tile)
// Large output: 8x8(x8) tiles over x/y(/c), fused into one parallel
// tile dimension (vectorization/parallel calls are on elided lines).
609 top.split(x, xo, xi, 8).split(y, yo, yi, 8).split(c, co, ci, 8)
610 .fuse(xo, yo, tile).fuse(co, tile, tile).fuse(n, tile, tile)
614 top.split(x, xo, xi, 8).split(y, yo, yi, 8)
615 .fuse(xo, yo, tile).fuse(c, tile, tile).fuse(n, tile, tile)
619 #endif // HAVE_HALIDE
// Computes output blob shapes from input shapes. Output spatial size
// follows Caffe semantics: with explicit pads, ceil/floor rounding per
// ceilMode, then clipped so the last pooling window starts inside the
// image; with a named padMode, the shared conv/pool helper decides.
// (A globalPooling branch presumably occupies the elided lines before
// the padMode check — TODO confirm.)
622 bool getMemoryShapes(const std::vector<MatShape> &inputs,
623 const int requiredOutputs,
624 std::vector<MatShape> &outputs,
625 std::vector<MatShape> &internals) const
627 CV_Assert(inputs.size() != 0);
628 Size in(inputs[0][3], inputs[0][2]), out;
635 else if (padMode.empty())
637 float height = (float)(in.height + 2 * pad.height - kernel.height) / stride.height;
638 float width = (float)(in.width + 2 * pad.width - kernel.width) / stride.width;
639 out.height = 1 + (ceilMode ? ceil(height) : floor(height));
640 out.width = 1 + (ceilMode ? ceil(width) : floor(width));
642 if (pad.height || pad.width)
644 // If we have padding, ensure that the last pooling starts strictly
645 // inside the image (instead of at the padding); otherwise clip the last.
646 if ((out.height - 1) * stride.height >= in.height + pad.height)
648 if ((out.width - 1) * stride.width >= in.width + pad.width)
650 CV_Assert((out.height - 1) * stride.height < in.height + pad.height);
651 CV_Assert((out.width - 1) * stride.width < in.width + pad.width);
656 getConvPoolOutParams(in, kernel, stride, padMode, Size(1, 1), out);
// MAX pooling emits an extra mask blob per input, interleaved as
// [out0, mask0, out1, mask1, ...] with identical shapes.
659 outputs.resize(type == MAX ? 2 * inputs.size() : inputs.size());
660 for (size_t i = 0; i < inputs.size(); i++)
662 size_t index = type == MAX ? 2*i : i;
663 int dims[] = {inputs[i][0], inputs[i][1], out.height, out.width};
664 outputs[index] = shape(dims);
667 outputs[index + 1] = shape(dims);
// Rough FLOP estimate: one comparison per kernel element per output for
// MAX; kernel.area() adds plus one multiply per output for AVE (the
// type branch and flops declaration sit on elided lines).
673 virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
674 const std::vector<MatShape> &outputs) const
676 (void)inputs; // suppress unused variable warning
679 for(int i = 0; i < outputs.size(); i++)
684 flops += total(outputs[i])*kernel.area();
688 flops += total(outputs[i])*(kernel.area() + 1);
// Public factory: constructs the implementation from layer parameters.
695 Ptr<PoolingLayer> PoolingLayer::create(const LayerParams& params)
697 return Ptr<PoolingLayer>(new PoolingLayerImpl(params));