modules/dnn/src/layers/lrn_layer.cpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
  14 // Copyright (C) 2017, Intel Corporation, all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //   * Redistribution's of source code must retain the above copyright notice,
  21 //     this list of conditions and the following disclaimer.
  22 //
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,
  24 //     this list of conditions and the following disclaimer in the documentation
  25 //     and/or other materials provided with the distribution.
  26 //
  27 //   * The name of the copyright holders may not be used to endorse or promote products
  28 //     derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or implied warranties, including, but not limited to, the implied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //
  41 //M*/
  42
  43 #include "../precomp.hpp"
  44 #include "layers_common.hpp"
  45 #include "../op_cuda.hpp"
  46 #include "../op_halide.hpp"
  47 #include "../op_inf_engine.hpp"
  48 #include "../op_vkcom.hpp"
  49 #include "opencv2/imgproc.hpp"
  50 #include "opencv2/dnn/shape_utils.hpp"
  51 #include "opencv2/core/hal/hal.hpp"
  52 #include <algorithm>
  53
  54 #ifdef HAVE_OPENCL
  55 #include "opencl_kernels_dnn.hpp"
  56 using namespace cv::dnn::ocl4dnn;
  57 #endif
  58
  59 #ifdef HAVE_CUDA
  60 #include "../cuda4dnn/primitives/lrn.hpp"
  61 using namespace cv::dnn::cuda4dnn;
  62 #endif
  63
  64 namespace cv
  65 {
  66 namespace dnn
  67 {
  68
  69 class LRNLayerImpl CV_FINAL : public LRNLayer
  70 {
  71 public:
  72     LRNLayerImpl(const LayerParams& params)
  73     {
  74         setParamsFrom(params);
  75         type = -1;
  76         String nrmType = params.get<String>("norm_region", "ACROSS_CHANNELS");
  77         if (nrmType == "ACROSS_CHANNELS")
  78             type = CHANNEL_NRM;
  79         else if (nrmType == "WITHIN_CHANNEL")
  80             type = SPATIAL_NRM;
  81         else
  82             CV_Error(Error::StsBadArg, "Unknown region type \"" + nrmType + "\"");
  83
  84         size = params.get<int>("local_size", 5);
  85         if (size % 2 != 1 || size <= 0)
  86             CV_Error(Error::StsBadArg, "LRN layer supports only positive odd values for local_size");
  87
  88         alpha = params.get<double>("alpha", 1);
  89         beta = params.get<double>("beta", 0.75);
  90         bias = params.get<double>("bias", 1);
  91         normBySize = params.get<bool>("norm_by_size", true);
  92     }
  93
  94 #ifdef HAVE_OPENCL
  95     Ptr<OCL4DNNLRN<float> > lrnOp;
  96 #endif
  97
  98     virtual bool supportBackend(int backendId) CV_OVERRIDE
  99     {
 100         if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
 101             return bias == (int)bias;
 102         return backendId == DNN_BACKEND_OPENCV ||
 103                backendId == DNN_BACKEND_CUDA ||
 104                backendId == DNN_BACKEND_HALIDE ||
 105                (backendId == DNN_BACKEND_VKCOM && haveVulkan() && (size % 2 == 1) && (type == CHANNEL_NRM));
 106     }
 107
 108 #ifdef HAVE_OPENCL
 109     virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays) CV_OVERRIDE
 110     {
 111         lrnOp.release();
 112     }
 113
 114     bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
 115     {
 116         std::vector<UMat> inputs;
 117         std::vector<UMat> outputs;
 118
 119         bool use_half = (inps.depth() == CV_16S);
 120         inps.getUMatVector(inputs);
 121         outs.getUMatVector(outputs);
 122
 123         if (lrnOp.empty())
 124         {
 125             OCL4DNNLRNConfig config;
 126             config.lrn_type = type == CHANNEL_NRM ?
 127                               LRNParameter_NormRegion_ACROSS_CHANNELS :
 128                               LRNParameter_NormRegion_WITHIN_CHANNEL;
 129
 130             CHECK_EQ(size % 2, 1)<< "LRN only supports odd values for local_size";
 131             config.local_size = size;
 132             config.alpha = alpha;
 133             config.beta = beta;
 134             config.k = bias;
 135             CHECK_EQ(4, inputs[0].dims) << "Input must have 4 axes, "
 136                      << "corresponding to (num, channels, height, width)";
 137             config.batch_size = inputs[0].size[0];
 138             config.channels = inputs[0].size[1];
 139             config.height = inputs[0].size[2];
 140             config.width = inputs[0].size[3];
 141             config.norm_by_size = normBySize;
 142             config.use_half = use_half;
 143
 144             lrnOp = Ptr<OCL4DNNLRN<float> >(new OCL4DNNLRN<float>(config));
 145         }
 146
 147         if (!lrnOp->Forward(inputs[0], outputs[0]))
 148             return false;
 149
 150         return true;
 151     }
 152 #endif
 153
 154     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
 155     {
 156         CV_TRACE_FUNCTION();
 157         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
 158
 159         CV_Assert(inputs_arr.total() == outputs_arr.total());
 160
 161         CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
 162                    forward_ocl(inputs_arr, outputs_arr, internals_arr))
 163
 164         if (inputs_arr.depth() == CV_16S)
 165         {
 166             forward_fallback(inputs_arr, outputs_arr, internals_arr);
 167             return;
 168         }
 169
 170         std::vector<Mat> inputs, outputs;
 171         inputs_arr.getMatVector(inputs);
 172         outputs_arr.getMatVector(outputs);
 173
 174         CV_Assert(inputs.size() == outputs.size());
 175
 176         for (int i = 0; i < inputs.size(); i++)
 177         {
 178             CV_Assert(inputs[i].dims == 4);
 179
 180             Mat &src = inputs[i];
 181             Mat &dst = outputs[i];
 182
 183             switch (type)
 184             {
 185                 case CHANNEL_NRM:
 186                     channelNormalization(src, dst);
 187                     break;
 188                 case SPATIAL_NRM:
 189                     spatialNormalization(src, dst);
 190                     break;
 191                 default:
 192                     CV_Error(Error::StsNotImplemented, "Unimplemented mode of LRN layer");
 193                     break;
 194             }
 195         }
 196     }
 197
 198     class ChannelLRN : public ParallelLoopBody
 199     {
 200     public:
 201         ChannelLRN(const float* src, float* dst, int channels, int ksize,
 202                    float alpha1, float bias1, float beta1,
 203                    size_t planeSize, int nsamples, int nstripes)
 204         {
 205             src_ = src; dst_ = dst;
 206             channels_ = channels;
 207             ksize_ = ksize;
 208             alpha1_ = alpha1; bias1_ = bias1; beta1_ = beta1;
 209             planeSize_ = planeSize; nsamples_ = nsamples; nstripes_ = nstripes;
 210         }
 211
 212         void operator()(const Range& r) const CV_OVERRIDE
 213         {
 214             int nsamples = nsamples_, nstripes = nstripes_;
 215             size_t planeSize = planeSize_, planeSize_n = planeSize * nsamples;
 216             size_t elemsPerStripe = (planeSize_n + nstripes - 1)/nstripes;
 217             size_t rstart = r.start*elemsPerStripe;
 218             size_t rend = r.end == nstripes ? planeSize_n : r.end*elemsPerStripe;
 219             rstart = std::min(rstart, planeSize_n);
 220             rend = std::min(rend, planeSize_n);
 221             float alpha1 = alpha1_, bias1 = bias1_, beta1 = beta1_;
 222             int k, channels = channels_, ksize = ksize_;
 223
 224             AutoBuffer<float> buf_((channels + ksize + 1)*2);
 225             float* acc = buf_.data();
 226             float* buf = acc + channels + ksize + 1;
 227             for( k = 0; k <= ksize; k++ )
 228                 buf[-k-1] = buf[channels + k] = 0.f;
 229
 230             for( size_t ofs = rstart; ofs < rend; )
 231             {
 232                 int sampleIdx = (int)(ofs/planeSize);
 233                 if( sampleIdx >= nsamples )
 234                     break;
 235                 size_t ofs0 = ofs - sampleIdx*planeSize;
 236                 size_t ofs1 = std::min(planeSize - ofs0, rend - ofs) + ofs;
 237                 const float* src = src_ + sampleIdx*planeSize*channels + ofs0;
 238                 float* dst = dst_ + sampleIdx*planeSize*channels + ofs0;
 239
 240                 for( ; ofs < ofs1; ofs++, src++, dst++ )
 241                 {
 242                     for( k = 0; k < channels; k++ )
 243                         buf[k] = src[k*planeSize];
 244                     float s = 0;
 245                     for( k = 0; k < ksize; k++ )
 246                         s += buf[k]*buf[k];
 247                     for( k = 0; k < channels; k++ )
 248                     {
 249                         float x1 = buf[k + ksize];
 250                         float x0 = buf[k - ksize - 1];
 251                         s = std::max(s + (x1 + x0)*(x1 - x0), 0.f);
 252                         acc[k] = (float)(alpha1*s + bias1);
 253                     }
 254
 255                     hal::log32f(acc, acc, channels);
 256                     for( k = 0; k < channels; k++ )
 257                         acc[k] *= beta1;
 258                     hal::exp32f(acc, acc, channels);
 259
 260                     for( k = 0; k < channels; k++ )
 261                         dst[k*planeSize] = buf[k]*acc[k];
 262                 }
 263             }
 264         }
 265
 266         const float* src_;
 267         float* dst_;
 268         float alpha1_, bias1_, beta1_;
 269         size_t planeSize_;
 270         int channels_, ksize_, nsamples_, nstripes_;
 271     };
 272
 273     void channelNormalization(Mat &srcBlob, Mat &dstBlob)
 274     {
 275         int num = srcBlob.size[0];
 276         int channels = srcBlob.size[1];
 277         int ksize = (size - 1) / 2;
 278         int sizeNormFactor = normBySize ? size : 1;
 279         size_t planeSize = srcBlob.size[2]*srcBlob.size[3];
 280
 281         int nstripes = std::max(getNumThreads(), 1);
 282
 283         ChannelLRN clrn(srcBlob.ptr<float>(), dstBlob.ptr<float>(), channels,
 284                         ksize, alpha/sizeNormFactor, bias, -beta, planeSize, num, nstripes);
 285         parallel_for_(Range(0, nstripes), clrn, nstripes);
 286     }
 287
 288     void sqrBoxFilter_(const Mat &src, Mat &dst)
 289     {
 290         Mat srcRawWrapper(src.rows, src.cols, src.type(), src.data, src.step[0]);
 291         cv::sqrBoxFilter(srcRawWrapper, dst, dst.depth(), Size(size, size), Point(-1, -1), false, BORDER_CONSTANT);
 292     }
 293
 294     void spatialNormalization(Mat &srcBlob, Mat &dstBlob)
 295     {
 296         int num = srcBlob.size[0];
 297         int channels = srcBlob.size[1];
 298         int sizeNormFactor = normBySize ? size*size : 1;
 299
 300         Mat srcMat = srcBlob;
 301         Mat dstMat = dstBlob;
 302
 303         for (int n = 0; n < num; n++)
 304         {
 305             for (int cn = 0; cn < channels; cn++)
 306             {
 307                 Mat src = getPlane(srcMat, n, cn);
 308                 Mat dst = getPlane(dstMat, n, cn);
 309
 310                 sqrBoxFilter_(src, dst);
 311
 312                 dst.convertTo(dst, dst.type(), alpha/sizeNormFactor, bias);
 313                 cv::pow(dst, beta, dst);
 314                 cv::divide(src, dst, dst);
 315             }
 316         }
 317     }
 318
 319 #ifdef HAVE_CUDA
 320     Ptr<BackendNode> initCUDA(
 321         void *context_,
 322         const std::vector<Ptr<BackendWrapper>>& inputs,
 323         const std::vector<Ptr<BackendWrapper>>& outputs
 324     ) override
 325     {
 326         auto context = reinterpret_cast<csl::CSLContext*>(context_);
 327
 328         cuda4dnn::LRNType type_;
 329         if (type == CHANNEL_NRM)
 330             type_ = cuda4dnn::LRNType::ACROSS_CHANNELS;
 331         else if (type == SPATIAL_NRM)
 332             type_ = cuda4dnn::LRNType::WITHIN_CHANNEL;
 333         else
 334             CV_Error(Error::StsNotImplemented, "Unknown normalization region");
 335
 336         float alphaSize = alpha;
 337         if (!normBySize) {
 338             switch (type) {
 339             case CHANNEL_NRM: alphaSize = alpha * size; break;
 340             case SPATIAL_NRM: alphaSize = alpha * size * size; break;
 341             }
 342         }
 343
 344         std::size_t largestInputSize = 0;
 345         for(auto& wrapper : inputs) {
 346             auto input_wrapper = wrapper.dynamicCast<CUDABackendWrapper>();
 347             auto shape = input_wrapper->getShape();
 348             largestInputSize = std::max<std::size_t>(
 349                 largestInputSize,
 350                 std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies<int>())
 351             );
 352         }
 353
 354         return make_cuda_node<cuda4dnn::LRNOp>(preferableTarget,
 355             std::move(context->cudnn_handle), type_, size, alphaSize, beta, bias, largestInputSize);
 356     }
 357 #endif
 358
 359     virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
 360     {
 361 #ifdef HAVE_VULKAN
 362         std::shared_ptr<vkcom::OpBase> op(new vkcom::OpLRN(size / 2, bias, alpha, beta, normBySize));
 363         return Ptr<BackendNode>(new VkComBackendNode(inputs, op));
 364 #endif
 365         return Ptr<BackendNode>();
 366     }
 367
 368     virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
 369     {
 370 #ifdef HAVE_HALIDE
 371         float alphaSize = alpha;
 372         if (normBySize)
 373             alphaSize /= (type == CHANNEL_NRM ? size : size * size);
 374         int width, height, channels, numImgs;
 375         Halide::Buffer<float> inputBuffer = halideBuffer(inputs[0]);
 376         getCanonicalSize(inputBuffer, &width, &height, &channels, &numImgs);
 377
 378         Halide::Var x("x"), y("y"), c("c"), n("n");
 379         Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
 380         Halide::Func padded_sq(name + "_padded_sq");
 381         Halide::Func sq("sq");
 382         sq(x, y, c, n) = inputBuffer(x, y, c, n) * inputBuffer(x, y, c, n);
 383
 384         Halide::Func bounded =
 385             Halide::BoundaryConditions::constant_exterior(sq, 0, 0, width,
 386                                                           0, height,
 387                                                           0, channels,
 388                                                           0, numImgs);
 389         padded_sq(x, y, c, n) = bounded(x, y, c, n);
 390
 391         Halide::Expr base;
 392         if (type == CHANNEL_NRM)
 393         {
 394             Halide::RDom r((1 - size) / 2, size);
 395             base = alphaSize * sum(padded_sq(x, y, c + r, n));
 396         }
 397         else  // SPATIAL_NRM
 398         {
 399             Halide::RDom r((1 - size) / 2, size, (1 - size) / 2, size);
 400             base = alphaSize * sum(padded_sq(x + r.x, y + r.y, c, n));
 401         }
 402         base += static_cast<float>(bias);
 403         top(x, y, c, n) = inputBuffer(x, y, c, n) / pow(base, beta);
 404         return Ptr<BackendNode>(new HalideBackendNode({ padded_sq, top }));
 405 #endif  // HAVE_HALIDE
 406         return Ptr<BackendNode>();
 407     }
 408
 409     virtual void applyHalideScheduler(Ptr<BackendNode>& node,
 410                                       const std::vector<Mat*> &inputs,
 411                                       const std::vector<Mat> &outputs,
 412                                       int targetId) const CV_OVERRIDE
 413     {
 414 #ifdef  HAVE_HALIDE
 415         if (targetId != DNN_TARGET_CPU)
 416         {
 417             Layer::applyHalideScheduler(node, inputs, outputs, targetId);
 418             return;
 419         }
 420         int outW, outH, outC, outN;
 421         getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
 422
 423         Halide::Var x("x"), y("y"), c("c"), n("n"), yo("yo"), yi("yi"), tile("tile");
 424         Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs[1];
 425         Halide::Func& padded_sq = node.dynamicCast<HalideBackendNode>()->funcs[0];
 426
 427         if (outW < 8 || outH <= 2)
 428             return;
 429
 430         top.reorder(x, c, y, n)
 431            .split(y, yo, yi, 2)
 432            .fuse(yo, n, tile)
 433            .parallel(tile)
 434            .unroll(yi)
 435            .vectorize(x, 8);
 436         padded_sq.store_at(top, tile)
 437                  .compute_at(top, yi);
 438 #endif  // HAVE_HALIDE
 439     }
 440
 441 #ifdef HAVE_INF_ENGINE
 442     virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
 443     {
 444         float alphaSize = alpha;
 445         if (!normBySize)
 446             alphaSize *= (type == SPATIAL_NRM ? size*size : size);
 447
 448         InferenceEngine::Builder::NormLayer ieLayer(name);
 449         ieLayer.setSize(size);
 450         ieLayer.setAlpha(alphaSize);
 451         ieLayer.setBeta(beta);
 452         ieLayer.setAcrossMaps(type == CHANNEL_NRM);
 453
 454         InferenceEngine::Builder::Layer l = ieLayer;
 455         l.getParameters()["k"] = bias;
 456         return Ptr<BackendNode>(new InfEngineBackendNode(l));
 457     }
 458 #endif  // HAVE_INF_ENGINE
 459
 460     virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
 461                            const std::vector<MatShape> &outputs) const CV_OVERRIDE
 462     {
 463         CV_UNUSED(outputs); // suppress unused variable warning
 464         CV_Assert(inputs.size() > 0);
 465         long flops = 0;
 466
 467         for(int i = 0; i < inputs.size(); i++)
 468         {
 469             if (type == CHANNEL_NRM)
 470             {
 471                 int channels = inputs[i][1];
 472                 int ksize = (size - 1) / 2;
 473
 474                 flops += inputs[i][0]*(std::min(ksize, channels)*2*total(inputs[i], 2) + channels*4*total(inputs[i], 2));
 475
 476                 if (ksize < channels)
 477                 {
 478                     flops += (size + 2*(channels - size))*total(inputs[i], 2);
 479                 }
 480             }
 481             else
 482             {
 483                 flops += total(inputs[i])*(2*size*size + 2);
 484             }
 485         }
 486         return flops;
 487     }
 488
 489 private:
 490     enum Type
 491     {
 492         CHANNEL_NRM,
 493         SPATIAL_NRM
 494     };
 495 };
 496
 497 Ptr<LRNLayer> LRNLayer::create(const LayerParams& params)
 498 {
 499     return Ptr<LRNLayer>(new LRNLayerImpl(params));
 500 }
 501
 502 }
 503 }