/*
 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
18 #ifndef __NNFW_CKER_OPTIMIZED_CONV_H__
19 #define __NNFW_CKER_OPTIMIZED_CONV_H__
21 #include "OptimizedUtils.h"
23 #include "cker/eigen/EigenSupport.h"
24 #include "cker/eigen/Utils.h"
25 #include "cker/gemmlowp/GEMMSupport.h"
26 #include "cker/neon/neon_check.h"
27 #include "cker/operation/Common.h"
28 #include "cker/Shape.h"
29 #include "cker/Types.h"
31 #include <public/gemmlowp.h>
32 #include <public/map.h>
33 #include <fixedpoint/fixedpoint.h>
45 struct GemmlowpOutputPipeline
47 typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col> ColVectorMap;
48 typedef std::tuple<gemmlowp::OutputStageBiasAddition<ColVectorMap>,
49 gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent,
50 gemmlowp::OutputStageClamp, gemmlowp::OutputStageSaturatingCastToUint8>
52 static Pipeline MakeExp(const int32_t *bias_data, int output_rows, int32_t output_offset,
53 int32_t output_multiplier, int output_left_shift,
54 int32_t output_activation_min, int32_t output_activation_max)
56 ColVectorMap bias_vector(bias_data, output_rows);
57 gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
58 bias_addition_stage.bias_vector = bias_vector;
59 gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent quantize_down_stage;
60 quantize_down_stage.result_offset_after_shift = output_offset;
61 quantize_down_stage.result_fixedpoint_multiplier = output_multiplier;
62 quantize_down_stage.result_exponent = output_left_shift;
63 gemmlowp::OutputStageClamp clamp_stage;
64 clamp_stage.min = output_activation_min;
65 clamp_stage.max = output_activation_max;
66 gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
67 return std::make_tuple(bias_addition_stage, quantize_down_stage, clamp_stage,
68 saturating_cast_stage);
72 inline void AddBiasAndEvalActivationFunction(float output_activation_min,
73 float output_activation_max, const Shape &bias_shape,
74 const float *bias_data, const Shape &array_shape,
77 BiasAndClamp(output_activation_min, output_activation_max, bias_shape.FlatSize(), bias_data,
78 array_shape.FlatSize(), array_data);
81 inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const uint8_t *input_data,
82 const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape,
83 const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data,
84 const Shape &im2col_shape, uint8_t *im2col_data)
86 gemmlowp::GemmContext *gemm_context = gemm_support::GetGemmLowpContext();
88 const int stride_width = params.stride_width;
89 const int stride_height = params.stride_height;
90 const int dilation_width_factor = params.dilation_width_factor;
91 const int dilation_height_factor = params.dilation_height_factor;
92 const int32_t input_offset = params.input_offset;
93 const int32_t filter_offset = params.weights_offset;
94 const int32_t output_offset = params.output_offset;
95 const int32_t output_multiplier = params.output_multiplier;
96 const int output_shift = params.output_shift;
97 const int32_t output_activation_min = params.quantized_activation_min;
98 const int32_t output_activation_max = params.quantized_activation_max;
99 assert(input_shape.DimensionsCount() == 4);
100 assert(filter_shape.DimensionsCount() == 4);
101 assert(output_shape.DimensionsCount() == 4);
103 const uint8_t *gemm_input_data = nullptr;
104 const Shape *gemm_input_shape = nullptr;
105 const int filter_width = filter_shape.Dims(2);
106 const int filter_height = filter_shape.Dims(1);
107 const bool need_dilated_im2col = dilation_width_factor != 1 || dilation_height_factor != 1;
108 const bool need_im2col =
109 stride_width != 1 || stride_height != 1 || filter_width != 1 || filter_height != 1;
110 if (need_dilated_im2col)
113 const int input_zero_point = -input_offset;
114 assert(input_zero_point >= 0);
115 assert(input_zero_point <= 255);
116 DilatedIm2col(params, input_zero_point, input_shape, input_data, filter_shape, output_shape,
118 gemm_input_data = im2col_data;
119 gemm_input_shape = &im2col_shape;
121 else if (need_im2col)
124 const int input_zero_point = -input_offset;
125 assert(input_zero_point >= 0);
126 assert(input_zero_point <= 255);
127 Im2col(params, filter_height, filter_width, input_zero_point, input_shape, input_data,
128 im2col_shape, im2col_data);
129 gemm_input_data = im2col_data;
130 gemm_input_shape = &im2col_shape;
134 gemm_input_data = input_data;
135 gemm_input_shape = &input_shape;
138 const int gemm_input_rows = gemm_input_shape->Dims(3);
139 // Using FlatSizeSkipDim causes segfault in some contexts (see b/79927784).
140 // The root cause has not yet been identified though. Same applies below for
141 // the other calls commented out. This is a partial rollback of cl/196819423.
142 // const int gemm_input_cols = FlatSizeSkipDim(*gemm_input_shape, 3);
143 const int gemm_input_cols =
144 gemm_input_shape->Dims(0) * gemm_input_shape->Dims(1) * gemm_input_shape->Dims(2);
145 const int filter_rows = filter_shape.Dims(0);
147 // const int filter_cols = FlatSizeSkipDim(filter_shape, 0);
148 const int filter_cols = filter_shape.Dims(1) * filter_shape.Dims(2) * filter_shape.Dims(3);
149 const int output_rows = output_shape.Dims(3);
151 // const int output_cols = FlatSizeSkipDim(output_shape, 3);
152 const int output_cols = output_shape.Dims(0) * output_shape.Dims(1) * output_shape.Dims(2);
153 assert(output_rows == filter_rows);
154 assert(output_cols == gemm_input_cols);
155 assert(filter_cols == gemm_input_rows);
156 assert(bias_shape.FlatSize() == output_rows);
157 UNUSED_RELEASE(bias_shape);
158 gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor> filter_matrix(
159 filter_data, filter_rows, filter_cols);
160 gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor> input_matrix(
161 gemm_input_data, gemm_input_rows, gemm_input_cols);
162 gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::ColMajor> output_matrix(output_data, output_rows,
164 const auto &output_pipeline =
165 GemmlowpOutputPipeline::MakeExp(bias_data, output_rows, output_offset, output_multiplier,
166 output_shift, output_activation_min, output_activation_max);
167 gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
168 gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, input_offset,
172 } // namespace optimized
174 namespace multithreaded
178 template <class T> class EigenTensorConvFunctor
181 Eigen::PaddingType RuntimePadding2EigenPadding(PaddingType padding)
185 case PaddingType::kValid:
186 return Eigen::PADDING_VALID;
187 case PaddingType::kSame:
188 return Eigen::PADDING_SAME;
189 case PaddingType::kNone:
190 assert(false); // should never get here.
191 return Eigen::PADDING_VALID;
193 return Eigen::PADDING_SAME; // Prevent compiler warning about missing
198 void operator()(const Eigen::ThreadPoolDevice &device, const T *input_data, int input_batches,
199 int input_height, int input_width, int input_depth, const T *filter_data,
200 int filter_height, int filter_width, int filter_count, int stride_rows,
201 int stride_cols, int pad_height, int pad_width, nnfw::cker::PaddingType padding,
202 T *output_data, int output_height, int output_width)
204 const bool is_1x1_kernel =
205 (filter_height == 1 && filter_width == 1 && stride_rows == 1 && stride_cols == 1);
206 const bool is_same_height_width =
207 (filter_height == input_height && filter_width == input_width && pad_width == 0 &&
209 if (is_1x1_kernel || is_same_height_width)
211 // is_1x1_kernel: For 1x1 kernel, the 2D convolution is reduced to matrix multiplication.
212 // - output (input_batches * conv_width, filter_count)
213 // - input (input_batches * conv_width, input_depth)
214 // - filter (input_depth, filter_count)
215 // is_same_height_width: If the input data and filter have the same height/width, the 2D
216 // convolution is reduced to matrix multiplication.
217 // - output (input_batches, filter_count)
218 // - input (input_batches, filter_width * filter_height * input_depth)
219 // - filter (filter_width * filter_height * input_depth, filter_count)
220 const int conv_width = output_height * output_width;
221 int io_col = input_batches;
222 int filter_col = input_depth * filter_width * filter_height;
225 io_col *= conv_width;
227 Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
228 dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
229 eigen_support::EigenMatrix output(output_data, io_col, filter_count);
230 eigen_support::ConstEigenMatrix input(input_data, io_col, filter_col);
231 eigen_support::ConstEigenMatrix filter(filter_data, filter_col, filter_count);
232 eigen_support::MatMulConvFunctor<Eigen::ThreadPoolDevice, T>()(device, output, input, filter,
237 eigen_support::EigenTensor output(output_data, input_batches, output_height, output_width,
239 eigen_support::ConstEigenTensor input(input_data, input_batches, input_height, input_width,
241 eigen_support::ConstEigenTensor filter(filter_data, filter_height, filter_width, input_depth,
243 output.device(device) = Eigen::SpatialConvolution(input, filter, stride_cols, stride_rows,
244 RuntimePadding2EigenPadding(padding));
250 inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const float *input_data,
251 const Shape &filter_shape, const float *filter_data, const Shape &bias_shape,
252 const float *bias_data, const Shape &output_shape, float *output_data)
254 const Eigen::ThreadPoolDevice &device = *eigen_support::GetThreadPoolDevice();
256 const int stride_width = params.stride_width;
257 const int stride_height = params.stride_height;
258 const PaddingType padding = params.padding_type;
259 const int pad_width = params.padding_values.width;
260 const int pad_height = params.padding_values.height;
261 const float output_activation_min = params.float_activation_min;
262 const float output_activation_max = params.float_activation_max;
263 assert(input_shape.DimensionsCount() == 4);
264 assert(filter_shape.DimensionsCount() == 4);
265 assert(output_shape.DimensionsCount() == 4);
267 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
268 const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
269 const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
270 const int input_height = input_shape.Dims(1);
271 const int input_width = input_shape.Dims(2);
272 const int filter_height = filter_shape.Dims(1);
273 const int filter_width = filter_shape.Dims(2);
274 const int output_height = output_shape.Dims(1);
275 const int output_width = output_shape.Dims(2);
277 EigenTensorConvFunctor<float> conv_functor;
278 conv_functor(device, input_data, batches, input_height, input_width, input_depth, filter_data,
279 filter_height, filter_width, output_depth, stride_height, stride_width, pad_height,
280 pad_width, padding, output_data, output_height, output_width);
282 optimized::AddBiasAndEvalActivationFunction(output_activation_min, output_activation_max,
283 bias_shape, bias_data, output_shape, output_data);
286 } // namespace multithreaded
290 #endif // __NNFW_CKER_OPTIMIZED_CONV_H__