modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                           License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2017, Intel Corporation, all rights reserved.
  14 // Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
  15 // Third party copyrights are property of their respective owners.
  16 //
  17 // Redistribution and use in source and binary forms, with or without modification,
  18 // are permitted provided that the following conditions are met:
  19 //
  20 //   * Redistribution's of source code must retain the above copyright notice,
  21 //     this list of conditions and the following disclaimer.
  22 //
  23 //   * Redistribution's in binary form must reproduce the above copyright notice,
  24 //     this list of conditions and the following disclaimer in the documentation
  25 //     and/or other materials provided with the distribution.
  26 //
  27 //   * The name of the copyright holders may not be used to endorse or promote products
  28 //     derived from this software without specific prior written permission.
  29 //
  30 // This software is provided by the copyright holders and contributors "as is" and
  31 // any express or implied warranties, including, but not limited to, the implied
  32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  33 // In no event shall the Intel Corporation or contributors be liable for any direct,
  34 // indirect, incidental, special, exemplary, or consequential damages
  35 // (including, but not limited to, procurement of substitute goods or services;
  36 // loss of use, data, or profits; or business interruption) however caused
  37 // and on any theory of liability, whether in contract, strict liability,
  38 // or tort (including negligence or otherwise) arising in any way out of
  39 // the use of this software, even if advised of the possibility of such damage.
  40 //
  41 //M*/
  42
  43 #ifndef _OPENCV_LIBDNN_HPP_
  44 #define _OPENCV_LIBDNN_HPP_
  45 #include "../../precomp.hpp"
  46 #include <iomanip>
  47 #include <map>
  48 #include <memory>
  49 #include <string>
  50 #include <vector>
  51 #include "common.hpp"
  52
  53 namespace cv { namespace dnn { namespace ocl4dnn {
  54 #ifdef HAVE_OPENCL
  55
  56 struct OCL4DNNConvConfig
  57 {
  58     OCL4DNNConvConfig() :
  59         kernel(1, 1),
  60         pad(0, 0),
  61         stride(1, 1),
  62         dilation(1, 1),
  63         group(1),
  64         bias_term(false)
  65     {}
  66     MatShape in_shape;
  67     MatShape out_shape;
  68     Size kernel;
  69     Size pad;
  70     Size stride;
  71     Size dilation;
  72     int group; // = 1;
  73     bool bias_term; // = false;
  74 };
  75
  76
  77 template<typename Dtype>
  78 class OCL4DNNConvSpatial
  79 {
  80     public:
  81         explicit OCL4DNNConvSpatial(OCL4DNNConvConfig config);
  82         ~OCL4DNNConvSpatial();
  83         bool Forward(const UMat& bottom_data, const UMat& weight,
  84                      const UMat& bias,
  85                      UMat& top_data, int32_t batch_size);
  86
  87     private:
  88         struct kernelConfig
  89         {
  90             std::string kernelName;
  91             float executionTime;
  92             size_t local_work_size[3];
  93             size_t global_work_size[3];
  94             int32_t workItem_output[3];
  95             bool verified;
  96             bool tested;
  97             bool swizzle_weights;
  98             bool use_null_local;
  99             int32_t kernelType;
 100
 101             kernelConfig()
 102             {}
 103
 104             kernelConfig(const std::string& name, const size_t* global_size, const size_t* local_size,
 105                          const int32_t* workItem,
 106                          bool swizzle,
 107                          int32_t type = 0)
 108                 : executionTime(0)
 109             {
 110                 kernelName = name;
 111                 for (int32_t x = 0; x < 3; x++)
 112                 {
 113                     local_work_size[x] = local_size ? local_size[x] : 1;
 114                     global_work_size[x] = global_size[x];
 115                     workItem_output[x] = workItem[x];
 116                 }
 117                 swizzle_weights = swizzle;
 118                 use_null_local = local_size == NULL;
 119                 verified = false;
 120                 tested = false;
 121                 kernelType = type;
 122             }
 123         };
 124
 125         struct tunerParam
 126         {
 127            int kernelType;
 128            int blockWidth;
 129            int blockHeight;
 130            int blockDepth;
 131
 132            tunerParam(int type, int w, int h, int d)
 133            {
 134                kernelType = type;
 135                blockWidth = w;
 136                blockHeight= h;
 137                blockDepth = d;
 138            }
 139         };
 140
 141         inline void addDef(const char* name)
 142         {
 143             options_ << " -D " << name;
 144         }
 145
 146         inline void addDef(const char* name, const int value)
 147         {
 148             options_ << " -D " << name << "=" << value;
 149         }
 150
 151         inline void addDef(const char* name, const float value)
 152         {
 153             options_ << " -D " << name << "=(float)" << value;
 154         }
 155
 156         inline void addDef(const char* name, const double value)
 157         {
 158             options_ << " -D " << name << "=(double)" << value;
 159         }
 160
 161         inline void addDef(const char* name, const char* value)
 162         {
 163             options_ << " -D " << name << "=" << value;
 164         }
 165
 166         void useFirstAvailable(const UMat &bottom,
 167                                UMat &top,
 168                                const UMat &weight,
 169                                const UMat &bias,
 170                                int32_t numImages,
 171                                UMat &verifyTop);
 172         void setupKernel();
 173         void collectCommonInformation();
 174         void setupKernelDetails(int32_t kernelType,
 175                                 int32_t blockM,
 176                                 int32_t blockK,
 177                                 int32_t blockN);
 178
 179         ocl::Program compileKernel();
 180         typedef std::map<std::string, ocl::Program> phash_t;
 181         phash_t phash;
 182         void calculateBenchmark(const UMat &bottom, UMat &verifyTop,
 183                                 const UMat &weight, const UMat &bias,
 184                                 int32_t numImages);
 185
 186
 187         void setupConvolution(const UMat &bottom,
 188                               UMat &top,
 189                               const UMat &weight,
 190                               const UMat &bias,
 191                               int32_t numImags,
 192                               UMat &verifyTop);
 193         bool createConvolutionKernel(int32_t kernelType,
 194                                      int32_t blockWidth,
 195                                      int32_t blockHeight,
 196                                      int32_t blockDepth);
 197         bool setupIDLF(int32_t blockWidth,
 198                        int32_t blockHeight,
 199                        int32_t blockDepth);
 200         bool createBasicKernel(int32_t blockWidth,
 201                                int32_t blockHeight,
 202                                int32_t blockDepth);
 203         bool createGEMMLikeConvKernel(int32_t blockWidth,
 204                                       int32_t blockHeight,
 205                                       int32_t blockDepth);
 206         void CreateSubBuffer(const UMat& buffer, UMat& sub_buffer,
 207                              int32_t offset, int32_t size, bool write_only);
 208         bool convolve(const UMat &bottom, UMat &top,
 209                       const UMat &weight, const UMat &bias,
 210                       int32_t numImages,
 211                       kernelConfig* config,
 212                       const cv::ocl::Queue& queue);
 213         float timedConvolve(const UMat &bottom, UMat &top,
 214                             const UMat &weight, const UMat &bias,
 215                             int32_t numImages, kernelConfig* config);
 216
 217         bool verifyResult(const UMat &bottom,
 218                           UMat &top,
 219                           const UMat &weight,
 220                           const UMat &bias,
 221                           int32_t numImages,
 222                           kernelConfig* config,
 223                           UMat &verifyTop);
 224
 225         bool swizzleWeight(const UMat &weight,
 226                            int32_t swizzled_factor,
 227                            bool interleave = false);
 228
 229         void generateKey();
 230         std::string generateSpecificKey(int32_t type, int32_t blockWidth,
 231                                           int32_t blockHeight,
 232                                           int32_t blockDepth);
 233         void cacheTunedConfig();
 234         bool loadTunedConfig();
 235
 236         void saveTunedConfig();
 237         bool loadCachedConfig();
 238
 239         void unloadProgram(const std::string& kernelName);
 240         void prepareKernel(const UMat &bottom, UMat &top,
 241                            const UMat &weight, const UMat &bias,
 242                            int32_t numImages);
 243         bool setupKernelByConfig(int x, int y, int z, int type,
 244                                  int lx, int ly, int lz,
 245                                  bool swizzle, bool nullLocal);
 246         void generateTunerItems(std::vector< cv::Ptr<tunerParam> > &tunerItems);
 247
 248         int32_t group_;
 249         bool bias_term_;
 250         UMat swizzled_weights_umat;
 251
 252         int32_t bottom_index_;
 253         int32_t output_h_;
 254         int32_t output_w_;
 255         int32_t kernel_h_;
 256         int32_t kernel_w_;
 257         int32_t height_;
 258         int32_t width_;
 259         int32_t pad_h_;
 260         int32_t pad_w_;
 261         int32_t stride_h_;
 262         int32_t stride_w_;
 263         int32_t dilation_h_;
 264         int32_t dilation_w_;
 265
 266         /// M_ is the channel dimension of the output for a single group, which is the
 267         /// leading dimension of the filter matrix.
 268         int32_t M_;
 269
 270         bool tuned_;
 271         std::string key_, key_sanitized_;
 272         std::string short_key_;
 273         std::string kernel_name_;
 274         std::string cache_path_;
 275         bool use_cache_path_; // true if cache_path_ directory exists
 276         bool force_auto_tuning_;
 277         int32_t kernel_index_;
 278         std::vector< cv::Ptr<kernelConfig> > kernelQueue;
 279         cv::Ptr<kernelConfig> bestKernelConfig;
 280
 281         int32_t bottom_dim_;
 282         int32_t top_dim_;
 283         int32_t num_;
 284         int32_t channels_;
 285         int32_t num_output_;
 286
 287         int32_t kernelType_;
 288         int32_t blockM_;
 289         int32_t blockK_;
 290         int32_t blockN_;
 291         std::stringstream options_;
 292         cv::ocl::ProgramSource src_;
 293         int32_t prev_kernel_type_;
 294 };
 295
 /// Pooling method selector used by OCL4DNNPoolConfig and OCL4DNNPool.
 /// The numeric values are spelled out explicitly and must stay as they are.
 typedef enum {
     LIBDNN_POOLING_METHOD_MAX = 0,  // presumably max pooling
     LIBDNN_POOLING_METHOD_AVE = 1,  // presumably average pooling
     LIBDNN_POOLING_METHOD_STO = 2   // presumably stochastic pooling -- TODO confirm
 } ocl4dnnPoolingMethod_t;
 301
 302 struct OCL4DNNPoolConfig
 303 {
 304     OCL4DNNPoolConfig() :
 305         kernel(1, 1),
 306         pad(0, 0),
 307         stride(1, 1),
 308         dilation(1, 1),
 309         channels(0),
 310         pool_method(LIBDNN_POOLING_METHOD_MAX),
 311         global_pooling(false)
 312     {}
 313     MatShape in_shape;
 314     MatShape out_shape;
 315     Size kernel;
 316     Size pad;
 317     Size stride;
 318     Size dilation;
 319
 320     int channels;
 321     ocl4dnnPoolingMethod_t pool_method; // = LIBDNN_POOLING_METHOD_MAX;
 322     bool global_pooling; // = false;
 323 };
 324
 325 template<typename Dtype>
 326 class OCL4DNNPool
 327 {
 328     public:
 329         explicit OCL4DNNPool(OCL4DNNPoolConfig config);
 330         ~OCL4DNNPool();
 331         bool Forward(const UMat& bottom_data,
 332                      UMat& top_data,
 333                      UMat& top_mask);
 334     private:
 335         UMat mask_idx_;
 336
 337         // Pooling parameters
 338         std::vector<int32_t> pad_;
 339         std::vector<int32_t> stride_;
 340         std::vector<int32_t> kernel_shape_;
 341         std::vector<int32_t> im_in_shape_;
 342         std::vector<int32_t> im_out_shape_;
 343
 344         ocl4dnnPoolingMethod_t pool_method_;
 345         int32_t count_;
 346         int32_t batch_size_;
 347         int32_t channels_;
 348         int32_t kernel_h_;
 349         int32_t kernel_w_;
 350         int32_t stride_h_;
 351         int32_t stride_w_;
 352         int32_t pad_h_;
 353         int32_t pad_w_;
 354         int32_t height_;
 355         int32_t width_;
 356         int32_t pooled_height_;
 357         int32_t pooled_width_;
 358 };
 359
 /**
  * Parameter bundle passed to the OCL4DNNInnerProduct constructor.
  *
  * A default-constructed config has num_output, M and K all zero, bias_term
  * and transpose false, and phase_test true.
  */
 struct OCL4DNNInnerProductConfig
 {
     int num_output;   // defaults to 0
     int M;            // defaults to 0
     int K;            // defaults to 0
     bool bias_term;   // defaults to false
     bool transpose;   // defaults to false
     bool phase_test;  // defaults to true

     // Initialisers are listed in declaration order.
     OCL4DNNInnerProductConfig()
         : num_output(0)
         , M(0)
         , K(0)
         , bias_term(false)
         , transpose(false)
         , phase_test(true)
     {
     }
 };
 373
 374 template<typename Dtype>
 375 class OCL4DNNInnerProduct
 376 {
 377     public:
 378         explicit OCL4DNNInnerProduct(OCL4DNNInnerProductConfig config);
 379         ~OCL4DNNInnerProduct();
 380         bool Forward(const UMat& bottom_data,
 381                      const UMat& weight,
 382                      const UMat& bias,
 383                      UMat& top_data);
 384     private:
 385         OCL4DNNInnerProductConfig config_;
 386         int32_t axis_;
 387         int32_t num_output_;
 388         int32_t M_;
 389         int32_t N_;
 390         int32_t K_;
 391         bool bias_term_;
 392         bool transpose_;
 393         bool image_copied_;
 394         bool phase_test_;
 395 };
 396
 /// Normalisation-region selector used by OCL4DNNLRNConfig and OCL4DNNLRN.
 /// The typedef is named after its second enumerator; that name is kept
 /// unchanged because other declarations in this header refer to it.
 typedef enum {
     LRNParameter_NormRegion_ACROSS_CHANNELS = 0,  // presumably normalise across channels
     LRNParameter_NormRegion_WITHIN_CHANNEL  = 1   // presumably normalise inside one channel
 } LRNParameter_NormRegion_WITHIN_CHANNEL_t;
 401
 402 struct OCL4DNNLRNConfig
 403 {
 404     OCL4DNNLRNConfig() :
 405         lrn_type(LRNParameter_NormRegion_ACROSS_CHANNELS),
 406         phase_test(true),
 407         local_size(0), alpha(0.f), beta(0.f), k(0.f), norm_by_size(false),
 408         batch_size(0), channels(0), height(0), width(0)
 409     {}
 410     MatShape in_shape;
 411     LRNParameter_NormRegion_WITHIN_CHANNEL_t lrn_type;
 412     bool phase_test; // = true;
 413     int local_size;
 414     float alpha;
 415     float beta;
 416     float k;
 417     bool norm_by_size;
 418     int32_t batch_size;
 419     int32_t channels;
 420     int32_t height;
 421     int32_t width;
 422 };
 423
 424 template<typename Dtype>
 425 class OCL4DNNLRN
 426 {
 427     public:
 428         explicit OCL4DNNLRN(OCL4DNNLRNConfig config);
 429         bool Forward(const UMat& bottom_data, UMat& top_data);
 430
 431     private:
 432         bool crossChannelForward(const UMat& bottom_data, UMat& top_data);
 433         LRNParameter_NormRegion_WITHIN_CHANNEL_t lrn_type_;
 434         bool phase_test_;
 435         int32_t size_;
 436         Dtype alpha_;
 437         Dtype beta_;
 438         Dtype k_;
 439         int32_t num_;
 440         int32_t channels_;
 441         int32_t height_;
 442         int32_t width_;
 443         bool norm_by_size_;
 444 };
 445
 446 struct OCL4DNNSoftmaxConfig
 447 {
 448     OCL4DNNSoftmaxConfig() : axis(0), channels(0), logsoftmax(false)
 449     {}
 450     MatShape in_shape;
 451     int axis;
 452     int channels;
 453     bool logsoftmax;
 454 };
 455
 456 template<typename Dtype>
 457 class OCL4DNNSoftmax
 458 {
 459     public:
 460         explicit OCL4DNNSoftmax(OCL4DNNSoftmaxConfig config);
 461         ~OCL4DNNSoftmax();
 462         bool Forward(const UMat& bottom_data, UMat& top_data);
 463
 464     private:
 465         int32_t softmax_axis_;
 466         int32_t inner_num_;
 467         int32_t outer_num_;
 468         int32_t channels_;
 469         int32_t count_;
 470         bool use_slm_;
 471         bool log_softmax_;
 472         UMat scale_data_;
 473 };
 474 #endif // HAVE_OPENCL
 475 } // namespace ocl4dnn
 476 } // namespace dnn
} // namespace cv
 478 #endif