compute/cker/include/cker/operation/DepthwiseConv.h

   1 /*
   2  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
   3  * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
   4  *
   5  * Licensed under the Apache License, Version 2.0 (the "License");
   6  * you may not use this file except in compliance with the License.
   7  * You may obtain a copy of the License at
   8  *
   9  *      http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 #ifndef __NNFW_CKER_DEPTHWISE_CONV_H__
  19 #define __NNFW_CKER_DEPTHWISE_CONV_H__
  20
#include "cker/Shape.h"
#include "cker/Types.h"
#include "cker/Utils.h"
#include "cker/neon/neon_check.h"
#include "cker/operation/optimized/DepthwiseConvFloat.h"
#include "cker/operation/optimized/DepthwiseConvUint8.h"
#include "cker/operation/optimized/integer_ops/DepthwiseConvInt8.h"
#include "cker/operation/reference/integer_ops/DepthwiseConvUInt8.h"
#include "cker/operation/reference/integer_ops/DepthwiseConvHybrid.h"
#include "cker/CpuBackendThreadpool.h"

#include <cstdint>
#include <limits>
  31
  32 namespace nnfw
  33 {
  34 namespace cker
  35 {
  36
  37 // TODO(luwa): add multithread to per-channel depthwise_conv
  38 // DepthwiseConv can run with multi threads on the dim specified by thread_dim.
  39 // Each thread processes output elements on dim, thread_dim, in the range of
  40 // [thread_start, thread_end).
  41 // For example, assume thread_start = 2, thread_end = 6, and thread_dim = 1, it
  42 // means that it will calculate DepthwiseConv for output_data[:, 2:5, :, :].
  43 template <typename T, typename TS> struct DepthwiseConvWorkerTask : cpu_backend_threadpool::Task
  44 {
  45   DepthwiseConvWorkerTask(const DepthwiseConvParams &params, const Shape &input_shape,
  46                           const T *input_data, const Shape &filter_shape, const T *filter_data,
  47                           const Shape &bias_shape, const TS *bias_data, const Shape &output_shape,
  48                           T *output_data, int thread_start, int thread_end, int thread_dim)
  49     : params_(params), input_shape_(input_shape), input_data_(input_data),
  50       filter_shape_(filter_shape), filter_data_(filter_data), bias_shape_(bias_shape),
  51       bias_data_(bias_data), output_shape_(output_shape), output_data_(output_data),
  52       thread_start_(thread_start), thread_end_(thread_end), thread_dim_(thread_dim)
  53   {
  54   }
  55
  56   void Run() override
  57   {
  58     optimized::DepthwiseConvImpl(params_, input_shape_, input_data_, filter_shape_, filter_data_,
  59                                  bias_shape_, bias_data_, output_shape_, output_data_,
  60                                  thread_start_, thread_end_, thread_dim_);
  61   }
  62
  63 private:
  64   const DepthwiseConvParams &params_;
  65   const Shape &input_shape_;
  66   const T *input_data_;
  67   const Shape &filter_shape_;
  68   const T *filter_data_;
  69   const Shape &bias_shape_;
  70   const TS *bias_data_;
  71   const Shape &output_shape_;
  72   T *output_data_;
  73   // const CpuFlags& cpu_flags_;
  74   int thread_start_;
  75   int thread_end_;
  76   int thread_dim_;
  77 };
  78
  79 inline int HowManyConvThreads(const Shape &output_shape, const Shape &filter_shape)
  80 {
  81   // How many scalar multiplications are needed to make it worth using one
  82   // more thread
  83   static constexpr int kMinMulPerThread = 1 << 13; // 8k
  84   const int filter_height = filter_shape.Dims(1);
  85   const int filter_width = filter_shape.Dims(2);
  86   const int num_muls = output_shape.FlatSize() * filter_height * filter_width;
  87   // Try to avoid real runtime divisions if possible by dividing by a
  88   // compile-time constant.
  89   int thread_count = std::max(1, num_muls / kMinMulPerThread);
  90   return thread_count;
  91 }
  92
  93 inline bool MultithreadAlongBatches(int thread_count, int batches)
  94 {
  95   assert(thread_count >= 2);
  96   // If there are fewer batch entries than the number of threads we want to use,
  97   // then better do intra-batch-entry multithreading.
  98   if (batches < thread_count)
  99   {
 100     return false;
 101   }
 102   // If there are at least 2 batch entries to be handed to each thread, then
 103   // it's safe to proceed with batch-wise multithreading: each thread will have
 104   // approximately equal number of batch entries to handle, so the load
 105   // balancing will be reasonable, and the amount to which the load is not
 106   // perfectly balanced will be offset by the inherent advantages of
 107   // batch-wise multithreading (each thread is more efficient thanks to working
 108   // on larger buffers with less boundary-handling overhead).
 109   if (batches >= 2 * thread_count)
 110   {
 111     return true;
 112   }
 113   // In the limit case were there are at least 1 but not much more than 1
 114   // batch entries per thread, it may be a good idea to do per-batch
 115   // multithreading if the number of batch entries is a multiple of the number
 116   // of threads, so that each thread will have the same number of batch entries
 117   // to process.
 118   return ((batches % thread_count) == 0);
 119 }
 120
 121 template <typename T, typename TS>
 122 inline void DepthwiseConv(const DepthwiseConvParams &params, const Shape &input_shape,
 123                           const T *input_data, const Shape &filter_shape, const T *filter_data,
 124                           const Shape &bias_shape, const TS *bias_data, const Shape &output_shape,
 125                           T *output_data, ruy::Context *ruy_context)
 126 {
 127   assert(input_shape.DimensionsCount() == 4);
 128   assert(filter_shape.DimensionsCount() == 4);
 129   assert(output_shape.DimensionsCount() == 4);
 130
 131   int thread_count = HowManyConvThreads(output_shape, filter_shape);
 132
 133   // NOTE Borrow RuyContext to get max_num_threads setting
 134   // TODO Define and use max_num_threads for CPU backend
 135   const auto max_threads = (ruy_context == nullptr) ? 1 : ruy_context->max_num_threads();
 136
 137   thread_count = std::max(1, std::min(thread_count, max_threads));
 138   // Cap the number of threads to 2 for float path to avoid regression in
 139   // performance (b/132294857).
 140   if (std::is_floating_point<T>::value)
 141   {
 142     thread_count = std::min(thread_count, 2);
 143   }
 144
 145   const int output_batches = output_shape.Dims(0);
 146   const int output_height = output_shape.Dims(1);
 147
 148   if (thread_count == 1)
 149   {
 150     optimized::DepthwiseConvImpl(params, input_shape, input_data, filter_shape, filter_data,
 151                                  bias_shape, bias_data, output_shape, output_data, 0, output_height,
 152                                  1);
 153     return;
 154   }
 155
 156   int thread_dim, thread_dim_size;
 157   if (MultithreadAlongBatches(thread_count, output_batches))
 158   {
 159     thread_dim = 0;
 160     thread_dim_size = output_batches;
 161   }
 162   else
 163   {
 164     thread_dim = 1;
 165     thread_dim_size = output_height;
 166   }
 167
 168   std::vector<DepthwiseConvWorkerTask<T, TS>> tasks;
 169   // TODO(b/131746020) don't create new heap allocations every time.
 170   // At least we make it a single heap allocation by using reserve().
 171   tasks.reserve(thread_count);
 172   int thread_start = 0;
 173   for (int i = 0; i < thread_count; ++i)
 174   {
 175     int thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i);
 176     tasks.emplace_back(params, input_shape, input_data, filter_shape, filter_data, bias_shape,
 177                        bias_data, output_shape, output_data, thread_start, thread_end, thread_dim);
 178     thread_start = thread_end;
 179   }
 180   cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), ruy_context);
 181 }
 182
 183 } // namespace cker
 184 } // namespace nnfw
 185
 186 #endif // __NNFW_CKER_DEPTHWISE_CONV_H__