/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __NNFW_CKER_DEPTHWISE_CONV_H__
#define __NNFW_CKER_DEPTHWISE_CONV_H__

#include "cker/Shape.h"
#include "cker/Types.h"
#include "cker/Utils.h"
#include "cker/neon/neon_check.h"
#include "cker/operation/optimized/DepthwiseConvFloat.h"
#include "cker/operation/optimized/DepthwiseConvUint8.h"
#include "cker/operation/optimized/integer_ops/DepthwiseConvInt8.h"
#include "cker/operation/reference/integer_ops/DepthwiseConvUInt8.h"
#include "cker/CpuBackendThreadpool.h"

#include <algorithm>
#include <cassert>
#include <type_traits>
#include <vector>

namespace nnfw
{
namespace cker
{

// TODO(luwa): add multithread to per-channel depthwise_conv
// DepthwiseConv can run with multi threads on the dim specified by thread_dim.
// Each thread processes output elements on dim, thread_dim, in the range of
// [thread_start, thread_end).
// For example, assume thread_start = 2, thread_end = 6, and thread_dim = 1:
// DepthwiseConv will then be calculated for output_data[:, 2:6, :, :].
template <typename T, typename TS> struct DepthwiseConvWorkerTask : cpu_backend_threadpool::Task
{
  DepthwiseConvWorkerTask(const DepthwiseConvParams &params, const Shape &input_shape,
                          const T *input_data, const Shape &filter_shape, const T *filter_data,
                          const Shape &bias_shape, const TS *bias_data, const Shape &output_shape,
                          T *output_data, int thread_start, int thread_end, int thread_dim)
    : params_(params), input_shape_(input_shape), input_data_(input_data),
      filter_shape_(filter_shape), filter_data_(filter_data), bias_shape_(bias_shape),
      bias_data_(bias_data), output_shape_(output_shape), output_data_(output_data),
      thread_start_(thread_start), thread_end_(thread_end), thread_dim_(thread_dim)
  {
  }

  void Run() override
  {
    optimized::DepthwiseConvImpl(params_, input_shape_, input_data_, filter_shape_, filter_data_,
                                 bias_shape_, bias_data_, output_shape_, output_data_,
                                 thread_start_, thread_end_, thread_dim_);
  }

private:
  const DepthwiseConvParams &params_;
  const Shape &input_shape_;
  const T *input_data_;
  const Shape &filter_shape_;
  const T *filter_data_;
  const Shape &bias_shape_;
  const TS *bias_data_;
  const Shape &output_shape_;
  T *output_data_;
  // const CpuFlags& cpu_flags_;
  int thread_start_;
  int thread_end_;
  int thread_dim_;
};

inline int HowManyConvThreads(const Shape &output_shape, const Shape &filter_shape)
{
  // How many scalar multiplications are needed to make it worth using one
  // more thread.
  static constexpr int kMinMulPerThread = 1 << 13; // 8k
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int num_muls = output_shape.FlatSize() * filter_height * filter_width;
  // Try to avoid real runtime divisions if possible by dividing by a
  // compile-time constant.
  int thread_count = std::max(1, num_muls / kMinMulPerThread);
  return thread_count;
}
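
// Worked example of the heuristic above (numbers are illustrative): for an
// output of shape [1, 112, 112, 32] and a 3x3 filter,
// num_muls = 1 * 112 * 112 * 32 * 3 * 3 = 3,612,672, so
// thread_count = 3,612,672 / 8,192 = 441 before it is capped by the number of
// threads actually available in DepthwiseConv() below.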

inline bool MultithreadAlongBatches(int thread_count, int batches)
{
  assert(thread_count >= 2);
  // If there are fewer batch entries than the number of threads we want to use,
  // then better do intra-batch-entry multithreading.
  if (batches < thread_count)
  {
    return false;
  }
  // If there are at least 2 batch entries to be handed to each thread, then
  // it's safe to proceed with batch-wise multithreading: each thread will have
  // approximately equal number of batch entries to handle, so the load
  // balancing will be reasonable, and the amount to which the load is not
  // perfectly balanced will be offset by the inherent advantages of
  // batch-wise multithreading (each thread is more efficient thanks to working
  // on larger buffers with less boundary-handling overhead).
  if (batches >= 2 * thread_count)
  {
    return true;
  }
  // In the limit case where there is at least 1 but not much more than 1
  // batch entry per thread, it may be a good idea to do per-batch
  // multithreading if the number of batch entries is a multiple of the number
  // of threads, so that each thread will have the same number of batch entries
  // to handle and the load balancing will be perfect.
  return ((batches % thread_count) == 0);
}
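
// Illustrative decision table for MultithreadAlongBatches() with
// thread_count = 4 (each result follows directly from the three rules above):
//   batches = 3 -> false (3 < 4: fewer batch entries than threads)
//   batches = 8 -> true  (8 >= 2 * 4: at least 2 batch entries per thread)
//   batches = 6 -> false (6 % 4 != 0: uneven split in the limit case)
//   batches = 4 -> true  (4 % 4 == 0: perfectly balanced split)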

template <typename T, typename TS>
inline void DepthwiseConv(const DepthwiseConvParams &params, const Shape &input_shape,
                          const T *input_data, const Shape &filter_shape, const T *filter_data,
                          const Shape &bias_shape, const TS *bias_data, const Shape &output_shape,
                          T *output_data, ruy::Context *ruy_context)
{
  assert(input_shape.DimensionsCount() == 4);
  assert(filter_shape.DimensionsCount() == 4);
  assert(output_shape.DimensionsCount() == 4);

  int thread_count = HowManyConvThreads(output_shape, filter_shape);

  // NOTE Borrow RuyContext to get max_num_threads setting
  // TODO Define and use max_num_threads for CPU backend
  const auto max_threads = (ruy_context == nullptr) ? 1 : ruy_context->max_num_threads();

  thread_count = std::max(1, std::min(thread_count, max_threads));
  // Cap the number of threads to 2 for float path to avoid regression in
  // performance (b/132294857).
  if (std::is_floating_point<T>::value)
  {
    thread_count = std::min(thread_count, 2);
  }

  const int output_batches = output_shape.Dims(0);
  const int output_height = output_shape.Dims(1);

  if (thread_count == 1)
  {
    optimized::DepthwiseConvImpl(params, input_shape, input_data, filter_shape, filter_data,
                                 bias_shape, bias_data, output_shape, output_data, 0, output_height,
                                 1);
    return;
  }

  int thread_dim, thread_dim_size;
  if (MultithreadAlongBatches(thread_count, output_batches))
  {
    thread_dim = 0;
    thread_dim_size = output_batches;
  }
  else
  {
    thread_dim = 1;
    thread_dim_size = output_height;
  }

  std::vector<DepthwiseConvWorkerTask<T, TS>> tasks;
  // TODO(b/131746020) don't create new heap allocations every time.
  // At least we make it a single heap allocation by using reserve().
  tasks.reserve(thread_count);
  int thread_start = 0;
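  // The loop below hands thread i the range [thread_start, thread_end) along
  // thread_dim; the integer division pushes any remainder onto the later
  // threads. Example (illustrative): thread_dim_size = 10 and thread_count = 3
  // yield the ranges [0, 3), [3, 6), [6, 10).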
  for (int i = 0; i < thread_count; ++i)
  {
    int thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i);
    tasks.emplace_back(params, input_shape, input_data, filter_shape, filter_data, bias_shape,
                       bias_data, output_shape, output_data, thread_start, thread_end, thread_dim);
    thread_start = thread_end;
  }
  cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), ruy_context);
}
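
// Usage sketch (illustrative only; the DepthwiseConvParams field names are
// assumed to mirror cker/Types.h, and the shapes and data pointers are
// placeholders, not a definitive calling convention):
//
//   DepthwiseConvParams params{};
//   params.stride_width = params.stride_height = 1;
//   params.dilation_width_factor = params.dilation_height_factor = 1;
//   params.depth_multiplier = 1;
//   params.float_activation_min = std::numeric_limits<float>::lowest();
//   params.float_activation_max = std::numeric_limits<float>::max();
//
//   Shape input_shape{1, 112, 112, 32};  // NHWC
//   Shape filter_shape{1, 3, 3, 32};     // 1HWC; depth = channels * depth_multiplier
//   Shape bias_shape{32};
//   Shape output_shape{1, 110, 110, 32}; // VALID padding, stride 1
//
//   DepthwiseConv<float, float>(params, input_shape, input_data, filter_shape,
//                               filter_data, bias_shape, bias_data, output_shape,
//                               output_data, ruy_context);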

} // namespace cker
} // namespace nnfw

#endif // __NNFW_CKER_DEPTHWISE_CONV_H__