/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
43 #include "../precomp.hpp"
44 #include "layers_common.hpp"
45 #include "opencv2/core/hal/intrin.hpp"
46 #include "op_halide.hpp"
//TODO: add ceil_mode param
class PoolingLayerImpl : public PoolingLayer
{
public:
    PoolingLayerImpl(const LayerParams& params)
    {
        type = PoolingLayer::MAX;
        computeMaxIdx = true;
        globalPooling = false;

        if (params.has("pool"))
        {
            String pool = params.get<String>("pool").toLowerCase();
            if (pool == "max")
                type = PoolingLayer::MAX;
            else if (pool == "ave")
                type = PoolingLayer::AVE;
            else if (pool == "stochastic")
                type = PoolingLayer::STOCHASTIC;
            else
                CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\"");
        }

        getPoolingKernelParams(params, kernel.height, kernel.width, globalPooling,
                               pad.height, pad.width, stride.height, stride.width, padMode);
        setParamsFrom(params);
    }
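    // Illustrative example (not an exhaustive list of supported keys): a
    // Caffe-style "pool: MAX, kernel_size: 2, stride: 2" layer arrives here
    // as LayerParams with params.get<String>("pool") == "max" and the
    // kernel/stride/pad keys parsed by getPoolingKernelParams() above.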
    void finalize(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
    {
        CV_Assert(inputs.size() == 1);

        cv::Size inp(inputs[0]->size[3], inputs[0]->size[2]),
                 out(outputs[0].size[3], outputs[0].size[2]);

        if (globalPooling)
        {
            kernel = inp;
        }

        getConvPoolPaddings(inp, out, kernel, stride, padMode, pad);
    }
    virtual bool supportBackend(int backendId)
    {
        return backendId == DNN_BACKEND_DEFAULT ||
               (backendId == DNN_BACKEND_HALIDE && haveHalide() &&
                (type == PoolingLayer::MAX ||
                 (type == PoolingLayer::AVE && !pad.width && !pad.height)));
    }
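    // For MAX pooling the layer publishes two outputs per input blob:
    // outputs[2*ii] holds the pooled values and outputs[2*ii + 1] the flat
    // input indices of the maxima, stored as floats (see getMemoryShapes).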
    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        for (size_t ii = 0; ii < inputs.size(); ii++)
        {
            switch (type)
            {
                case MAX:
                    maxPooling(*inputs[ii], outputs[2 * ii], outputs[2 * ii + 1]);
                    break;
                case AVE:
                    avePooling(*inputs[ii], outputs[ii]);
                    break;
                default:
                    CV_Error(Error::StsNotImplemented, "Not implemented");
                    break;
            }
        }
    }
    virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
    {
        if (type == PoolingLayer::MAX)
            return initMaxPoolingHalide(inputs);
        else if (type == PoolingLayer::AVE)
            return initAvePoolingHalide(inputs);
        else
            return Ptr<BackendNode>();
    }
    class PoolingInvoker : public ParallelLoopBody
    {
    public:
        const Mat* src;
        Mat *dst, *mask;
        Size kernel, stride, pad;
        int nstripes;
        bool computeMaxIdx;
        std::vector<int> ofsbuf;
        int poolingType;

        PoolingInvoker() : src(0), dst(0), mask(0), nstripes(0), computeMaxIdx(0), poolingType(PoolingLayer::MAX) {}
        static void run(const Mat& src, Mat& dst, Mat& mask, Size kernel,
                        Size stride, Size pad, int poolingType,
                        bool computeMaxIdx, int nstripes)
        {
            CV_Assert(src.isContinuous() && dst.isContinuous() &&
                      src.type() == CV_32F && src.type() == dst.type() &&
                      src.dims == 4 && dst.dims == 4 &&
                      src.size[0] == dst.size[0] && src.size[1] == dst.size[1] &&
                      (mask.empty() || (mask.type() == src.type() && mask.size == dst.size)));

            PoolingInvoker p;

            p.src = &src;
            p.dst = &dst;
            p.mask = &mask;
            p.kernel = kernel;
            p.stride = stride;
            p.pad = pad;
            p.nstripes = nstripes;
            p.computeMaxIdx = computeMaxIdx;
            p.poolingType = poolingType;

            if( !computeMaxIdx )
            {
                p.ofsbuf.resize(kernel.width*kernel.height);
                for( int i = 0; i < kernel.height; i++ )
                    for( int j = 0; j < kernel.width; j++ )
                        p.ofsbuf[i*kernel.width + j] = src.size[3]*i + j;
            }

            parallel_for_(Range(0, nstripes), p, nstripes);
        }
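        // ofsbuf caches the flat offset of each kernel tap relative to the
        // window's top-left corner: for a 3x3 kernel it holds {0, 1, 2, w,
        // w+1, w+2, 2*w, 2*w+1, 2*w+2} with w = src.size[3] (the input row
        // stride). The vectorized max path below walks these offsets instead
        // of recomputing y*inp_width + x for every tap.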
        void operator()(const Range& r) const
        {
            int channels = dst->size[1], width = dst->size[3], height = dst->size[2];
            int inp_width = src->size[3], inp_height = src->size[2];
            size_t total = dst->total();
            size_t stripeSize = (total + nstripes - 1)/nstripes;
            size_t stripeStart = r.start*stripeSize;
            size_t stripeEnd = std::min(r.end*stripeSize, total);
            int kernel_w = kernel.width, kernel_h = kernel.height;
            int pad_w = pad.width, pad_h = pad.height;
            int stride_w = stride.width, stride_h = stride.height;
            bool compMaxIdx = computeMaxIdx;

#if CV_SIMD128
            // ofsbuf is filled only when the max indices are not requested;
            // guard the pointer so we never dereference an empty vector.
            const int* ofsptr = ofsbuf.empty() ? 0 : &ofsbuf[0];
            v_float32x4 idx00(0.f, (float)stride_w, (float)(stride_w*2), (float)(stride_w*3));
            v_float32x4 ones = v_setall_f32(1.f);
            v_float32x4 idx_delta = v_setall_f32((float)(inp_width - kernel_w));
#endif
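            // idx00 holds the flat input indices of the current tap for four
            // adjacent output columns (spaced stride_w apart); the inner loops
            // advance it by 1 per column step ('ones') and by
            // inp_width - kernel_w ('idx_delta') when wrapping to the next
            // kernel row.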
            for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
            {
                size_t ofs = ofs0;
                int x0 = (int)(ofs % width);
                ofs /= width;
                int y0 = (int)(ofs % height);
                ofs /= height;
                int c = (int)(ofs % channels);
                int n = (int)(ofs / channels);
                int ystart = y0 * stride_h - pad_h;
                int yend = min(ystart + kernel_h, inp_height + pad_h);
                int ydelta = yend - ystart;
                ystart = max(ystart, 0);
                yend = min(yend, inp_height);
                const float *srcData = src->ptr<float>(n, c);
                float *dstData = dst->ptr<float>(n, c, y0);
                float *dstMaskData = mask->data ? mask->ptr<float>(n, c, y0) : 0;

                int delta = std::min((int)(stripeEnd - ofs0), width - x0);
                ofs0 += delta;
                int x1 = x0 + delta;
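                // Each outer iteration handles a run of 'delta' consecutive
                // outputs, clipped both to the end of the current output row
                // and to the end of this stripe, before n, c, y0 are
                // re-derived from the flat offset.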
                if( poolingType == PoolingLayer::MAX )
                    for( ; x0 < x1; x0++ )
                    {
                        int xstart = x0 * stride_w - pad_w;
                        int xend = min(xstart + kernel_w, inp_width);
                        xstart = max(xstart, 0);

#if CV_SIMD128
                        // Fast path: 8 consecutive output columns whose
                        // windows lie fully inside the input, processed in
                        // two 4-lane registers.
                        if( xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
                        {
                            if( compMaxIdx )
                            {
                                v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
                                v_float32x4 max_val1 = max_val0;
                                v_float32x4 max_idx0 = v_setall_f32(-1.f);
                                v_float32x4 max_idx1 = max_idx0;
                                int index0 = ystart * inp_width + xstart;
                                v_float32x4 idx0 = idx00 + v_setall_f32((float)index0);
                                v_float32x4 idx1 = idx0 + v_setall_f32((float)(stride_w*4));

                                for (int y = ystart; y < yend; ++y)
                                {
                                    for (int x = xstart; x < xend; ++x, idx0 += ones, idx1 += ones)
                                    {
                                        const int index = y * inp_width + x;
                                        v_float32x4 v0(srcData[index], srcData[index + stride_w],
                                                       srcData[index + stride_w*2], srcData[index + stride_w*3]);
                                        v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
                                                       srcData[index + stride_w*6], srcData[index + stride_w*7]);
                                        max_idx0 = v_select(v0 > max_val0, idx0, max_idx0);
                                        max_idx1 = v_select(v1 > max_val1, idx1, max_idx1);
                                        max_val0 = v_max(max_val0, v0);
                                        max_val1 = v_max(max_val1, v1);
                                    }
                                    // Jump from the end of one kernel row to
                                    // the start of the next.
                                    idx0 += idx_delta;
                                    idx1 += idx_delta;
                                }

                                v_store(dstData + x0, max_val0);
                                v_store(dstData + x0 + 4, max_val1);
                                if (dstMaskData)
                                {
                                    v_store(dstMaskData + x0, max_idx0);
                                    v_store(dstMaskData + x0 + 4, max_idx1);
                                }
                                x0 += 7;
                            }
                            else
                            {
                                v_float32x4 max_val0 = v_setall_f32(-FLT_MAX);
                                v_float32x4 max_val1 = max_val0;

                                if( yend - ystart == kernel_h )
                                {
                                    // Full-height window: walk the
                                    // precomputed ofsbuf offsets.
                                    const float* srcData1 = srcData + ystart*inp_width + xstart;
                                    if( stride_w == 1 )
                                        for (int k = 0; k < kernel_w*kernel_h; k++)
                                        {
                                            int index = ofsptr[k];
                                            v_float32x4 v0 = v_load(srcData1 + index);
                                            v_float32x4 v1 = v_load(srcData1 + index + 4);
                                            max_val0 = v_max(max_val0, v0);
                                            max_val1 = v_max(max_val1, v1);
                                        }
#if CV_SSE2
                                    // _mm_shuffle_ps is SSE-specific, so this
                                    // stride-2 deinterleaving branch is only
                                    // compiled when the universal intrinsics
                                    // are SSE-backed.
                                    else if( stride_w == 2 )
                                        for (int k = 0; k < kernel_w*kernel_h; k++)
                                        {
                                            int index = ofsptr[k];
                                            v_float32x4 v00 = v_load(srcData1 + index), v01 = v_load(srcData1 + index + 4);
                                            v_float32x4 v0(_mm_shuffle_ps(v00.val, v01.val, _MM_SHUFFLE(2, 0, 2, 0)));
                                            v_float32x4 v10 = v_load(srcData1 + index + 8), v11 = v_load(srcData1 + index + 12);
                                            v_float32x4 v1(_mm_shuffle_ps(v10.val, v11.val, _MM_SHUFFLE(2, 0, 2, 0)));
                                            max_val0 = v_max(max_val0, v0);
                                            max_val1 = v_max(max_val1, v1);
                                        }
#endif
                                    else
                                        for (int k = 0; k < kernel_w*kernel_h; k++)
                                        {
                                            int index = ofsptr[k];
                                            v_float32x4 v0(srcData1[index], srcData1[index + stride_w],
                                                           srcData1[index + stride_w*2], srcData1[index + stride_w*3]);
                                            v_float32x4 v1(srcData1[index + stride_w*4], srcData1[index + stride_w*5],
                                                           srcData1[index + stride_w*6], srcData1[index + stride_w*7]);
                                            max_val0 = v_max(max_val0, v0);
                                            max_val1 = v_max(max_val1, v1);
                                        }
                                }
                                else
                                {
                                    for (int y = ystart; y < yend; ++y)
                                    {
                                        for (int x = xstart; x < xend; ++x)
                                        {
                                            const int index = y * inp_width + x;
                                            v_float32x4 v0(srcData[index], srcData[index + stride_w],
                                                           srcData[index + stride_w*2], srcData[index + stride_w*3]);
                                            v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
                                                           srcData[index + stride_w*6], srcData[index + stride_w*7]);
                                            max_val0 = v_max(max_val0, v0);
                                            max_val1 = v_max(max_val1, v1);
                                        }
                                    }
                                }
                                v_store(dstData + x0, max_val0);
                                v_store(dstData + x0 + 4, max_val1);
                                x0 += 7;
                            }
                        }
                        else
#endif
                        {
                            float max_val = -FLT_MAX;

                            if( compMaxIdx )
                            {
                                int max_index = -1;
                                for (int y = ystart; y < yend; ++y)
                                    for (int x = xstart; x < xend; ++x)
                                    {
                                        const int index = y * inp_width + x;
                                        float val = srcData[index];
                                        if (val > max_val)
                                        {
                                            max_index = index;
                                            max_val = val;
                                        }
                                    }

                                dstData[x0] = max_val;
                                if (dstMaskData)
                                    dstMaskData[x0] = max_index;
                            }
                            else
                            {
                                for (int y = ystart; y < yend; ++y)
                                    for (int x = xstart; x < xend; ++x)
                                    {
                                        const int index = y * inp_width + x;
                                        float val = srcData[index];
                                        max_val = std::max(max_val, val);
                                    }

                                dstData[x0] = max_val;
                            }
                        }
                    }
                else
                {
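                    // Average pooling: inv_kernel_area below divides by the
                    // *padded* window size (ydelta*xdelta), so cells that fall
                    // in the padding contribute zeros to the mean.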
                    for( ; x0 < x1; x0++ )
                    {
                        int xstart = x0 * stride_w - pad_w;
                        int xend = min(xstart + kernel_w, inp_width + pad_w);
                        int xdelta = xend - xstart;
                        xstart = max(xstart, 0);
                        xend = min(xend, inp_width);
                        float inv_kernel_area = 1.f/(ydelta*xdelta);

#if CV_SIMD128
                        if( xstart > 0 && x0 + 7 < x1 && (x0 + 7) * stride_w - pad_w + kernel_w < inp_width )
                        {
                            v_float32x4 sum_val0 = v_setzero_f32(), sum_val1 = v_setzero_f32();
                            v_float32x4 ikarea = v_setall_f32(inv_kernel_area);

                            for (int y = ystart; y < yend; ++y)
                            {
                                for (int x = xstart; x < xend; ++x)
                                {
                                    const int index = y * inp_width + x;
                                    v_float32x4 v0(srcData[index], srcData[index + stride_w],
                                                   srcData[index + stride_w*2], srcData[index + stride_w*3]);
                                    v_float32x4 v1(srcData[index + stride_w*4], srcData[index + stride_w*5],
                                                   srcData[index + stride_w*6], srcData[index + stride_w*7]);
                                    sum_val0 += v0;
                                    sum_val1 += v1;
                                }
                            }

                            v_store(dstData + x0, sum_val0*ikarea);
                            v_store(dstData + x0 + 4, sum_val1*ikarea);
                            x0 += 7;
                        }
                        else
#endif
                        {
                            float sum_val = 0.f;
                            for (int y = ystart; y < yend; ++y)
                                for (int x = xstart; x < xend; ++x)
                                {
                                    const int index = y * inp_width + x;
                                    float val = srcData[index];
                                    sum_val += val;
                                }

                            dstData[x0] = sum_val*inv_kernel_area;
                        }
                    }
                }
            }
        }
    };
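    // Minimal usage sketch of the invoker (shapes are illustrative only):
    // pooling a 1x1x4x4 NCHW blob with a 2x2 kernel and stride 2 yields a
    // 1x1x2x2 output; the maxPooling()/avePooling() wrappers below set this
    // up, e.g.
    //   PoolingInvoker::run(src, dst, mask, Size(2, 2), Size(2, 2), Size(0, 0),
    //                       PoolingLayer::MAX, true, getNumThreads());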
    void maxPooling(Mat &src, Mat &dst, Mat &mask)
    {
        const int nstripes = getNumThreads();
        PoolingInvoker::run(src, dst, mask, kernel, stride, pad, type, computeMaxIdx, nstripes);
    }

    void avePooling(Mat &src, Mat &dst)
    {
        const int nstripes = getNumThreads();
        Mat mask;
        PoolingInvoker::run(src, dst, mask, kernel, stride, pad, type, computeMaxIdx, nstripes);
    }
    virtual Ptr<BackendNode> initMaxPoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
    {
#ifdef HAVE_HALIDE
        Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
        const int inWidth = inputBuffer.width();
        const int inHeight = inputBuffer.height();

        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        Halide::RDom r(0, kernel.width, 0, kernel.height);
        Halide::Expr kx, ky;
        if (pad.width || pad.height)
        {
            kx = clamp(x * stride.width + r.x - pad.width, 0, inWidth - 1);
            ky = clamp(y * stride.height + r.y - pad.height, 0, inHeight - 1);
        }
        else
        {
            kx = min(x * stride.width + r.x, inWidth - 1);
            ky = min(y * stride.height + r.y, inHeight - 1);
        }

        // Halide::argmax returns tuple (r.x, r.y, max).
        Halide::Tuple res = argmax(inputBuffer(kx, ky, c, n));

        // Convert the in-kernel argmax coordinates (res[0], res[1]) into a
        // flat index in the input plane, as the native MAX path does.
        Halide::Expr max_index;
        if (pad.width || pad.height)
        {
            max_index = clamp(y * stride.height + res[1] - pad.height,
                              0, inHeight - 1) * inWidth +
                        clamp(x * stride.width + res[0] - pad.width,
                              0, inWidth - 1);
        }
        else
        {
            max_index = min(y * stride.height + res[1], inHeight - 1) * inWidth +
                        min(x * stride.width + res[0], inWidth - 1);
        }

        top(x, y, c, n) = { res[2], Halide::cast<float>(max_index) };
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }
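    // The Halide Func above returns a 2-tuple (max value, flat index cast to
    // float), mirroring the two-output contract of the native MAX path.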
    virtual Ptr<BackendNode> initAvePoolingHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
    {
#ifdef HAVE_HALIDE
        Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);

        const int inW = inputBuffer.width(), inH = inputBuffer.height();
        if ((inW - kernel.width) % stride.width || (inH - kernel.height) % stride.height)
        {
            CV_Error(cv::Error::StsNotImplemented,
                     "Halide backend for average pooling with partial "
                     "kernels is not implemented");
        }

        const float norm = 1.0f / (kernel.width * kernel.height);

        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        Halide::RDom r(0, kernel.width, 0, kernel.height);
        top(x, y, c, n) = sum(
            inputBuffer(x * stride.width + r.x,
                        y * stride.height + r.y, c, n)) * norm;
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }
    virtual void applyHalideScheduler(Ptr<BackendNode>& node,
                                      const std::vector<Mat*> &inputs,
                                      const std::vector<Mat> &outputs,
                                      int targetId) const
    {
#ifdef HAVE_HALIDE
        if (targetId != DNN_TARGET_CPU)
        {
            Layer::applyHalideScheduler(node, inputs, outputs, targetId);
            return;
        }
        Halide::Var x("x"), y("y"), c("c"), n("n"), tile("tile"),
                    xi("xi"), yi("yi"), ci("ci"), xo("xo"), yo("yo"), co("co");
        Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();

        int outW, outH, outC, outN;
        getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);

        if (outW < 8 || outH < 8)
        {
            if (outC > 8)
                top.split(c, co, ci, 8)
                   .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
                   .parallel(tile)
                   .vectorize(ci);
            else
            {
                top.fuse(y, c, tile).fuse(n, tile, tile)
                   .parallel(tile);
                if (outW > 1)
                    top.vectorize(x);
            }
        }
        else
        {
            if (outC > 8)
                top.split(x, xo, xi, 8).split(y, yo, yi, 8).split(c, co, ci, 8)
                   .fuse(xo, yo, tile).fuse(co, tile, tile).fuse(n, tile, tile)
                   .parallel(tile)
                   .vectorize(xi);
            else
                top.split(x, xo, xi, 8).split(y, yo, yi, 8)
                   .fuse(xo, yo, tile).fuse(c, tile, tile).fuse(n, tile, tile)
                   .parallel(tile)
                   .vectorize(xi);
        }
#endif  // HAVE_HALIDE
    }
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const
    {
        CV_Assert(inputs.size() != 0);
        Size in(inputs[0][3], inputs[0][2]), out;

        if (globalPooling)
        {
            out.height = 1;
            out.width = 1;
        }
        else if (padMode.empty())
        {
            // Caffe-compatible scheme: round the output size up (ceil)
            // rather than down.
            out.height = static_cast<int>(ceil(static_cast<float>(in.height + 2 * pad.height -
                                                                  kernel.height) / stride.height)) + 1;
            out.width = static_cast<int>(ceil(static_cast<float>(in.width + 2 * pad.width -
                                                                 kernel.width) / stride.width)) + 1;

            if (pad.height || pad.width)
            {
                // If we have padding, ensure that the last pooling starts strictly
                // inside the image (instead of at the padding); otherwise clip the last.
                if ((out.height - 1) * stride.height >= in.height + pad.height)
                    --out.height;
                if ((out.width - 1) * stride.width >= in.width + pad.width)
                    --out.width;
                CV_Assert((out.height - 1) * stride.height < in.height + pad.height);
                CV_Assert((out.width - 1) * stride.width < in.width + pad.width);
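                // Worked example of the clipping rule: in=5, kernel=2,
                // stride=2, pad=1 gives out = ceil((5 + 2 - 2)/2) + 1 = 4,
                // but (4-1)*2 = 6 >= 5 + 1 = 6, so the size is clipped to 3.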
            }
        }
        else
        {
            getConvPoolOutParams(in, kernel, stride,
                                 padMode, out);
        }

        outputs.resize(type == MAX ? 2 * inputs.size() : inputs.size());
        for (size_t i = 0; i < inputs.size(); i++)
        {
            size_t index = type == MAX ? 2*i : i;
            int dims[] = {inputs[i][0], inputs[i][1], out.height, out.width};
            outputs[index] = shape(dims);
            if (type == MAX)
                outputs[index + 1] = shape(dims);
        }

        return false;
    }
    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const
    {
        (void)inputs; // suppress unused variable warning
        long flops = 0;

        for (int i = 0; i < (int)outputs.size(); i++)
        {
            if (type == MAX)
            {
                // MAX publishes value/index output pairs; only the even
                // (value) outputs cost one comparison per kernel cell.
                if (i % 2 == 0)
                    flops += total(outputs[i])*kernel.area();
            }
            else
            {
                // AVE: kernel.area() additions plus one multiply per output.
                flops += total(outputs[i])*(kernel.area() + 1);
            }
        }
        return flops;
    }
};
Ptr<PoolingLayer> PoolingLayer::create(const LayerParams& params)
{
    return Ptr<PoolingLayer>(new PoolingLayerImpl(params));
}

}  // namespace dnn
}  // namespace cv
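// Usage sketch (assumed caller-side code, for illustration): constructing a
// 2x2, stride-2 max-pooling layer from the Caffe-style keys parsed above.
//
//   cv::dnn::LayerParams lp;
//   lp.set("pool", "max");
//   lp.set("kernel_size", 2);
//   lp.set("stride", 2);
//   cv::Ptr<cv::dnn::PoolingLayer> pool = cv::dnn::PoolingLayer::create(lp);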