From 2ade35e42320121d583e907d4737fe29a903f6c8 Mon Sep 17 00:00:00 2001
From: =?utf8?q?=EC=98=A4=ED=98=95=EC=84=9D/On-Device=20Lab=28SR=29/Staff?=
 =?utf8?q?=20Engineer/=EC=82=BC=EC=84=B1=EC=A0=84=EC=9E=90?=
Date: Mon, 1 Apr 2019 19:08:26 +0900
Subject: [PATCH] Introduce cpu quant8 convolution kernel (#4910)

Introduce cpu quantized uint8 convolution kernel ported from tflite and gemmlowp
Use the kernel in the neurun cpu backend
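
For reference, the per-element output path of the new kernel boils down to the
following sketch (illustration only, not part of the diff; the helper name and
standalone form are hypothetical):

    #include <algorithm>
    #include <cstdint>
    #include "cker/Utils.h" // nnfw::cker::MultiplyByQuantizedMultiplier

    // Scale an int32 convolution accumulator back to the uint8 output range,
    // using the multiplier/shift pair produced by QuantizeMultiplier().
    uint8_t requantize(int32_t acc, int32_t output_multiplier, int output_shift,
                       int32_t output_offset, int32_t act_min, int32_t act_max)
    {
      acc = nnfw::cker::MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
      acc += output_offset;                            // output zero-point
      acc = std::min(std::max(acc, act_min), act_max); // fused activation clamp
      return static_cast<uint8_t>(acc);
    }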

Signed-off-by: Hyeongseok Oh
---
 libs/cker/README.md                                |   2 +-
 libs/cker/include/cker/FixedPoint.h                |  57 ++++++++++
 libs/cker/include/cker/Utils.h                     |  10 ++
 libs/cker/include/cker/operation/Conv.h            |  83 ++++++++++++++
 .../neurun/backend/cpu/kernel/ConvolutionLayer.cc  | 125 +++++----------------
 .../neurun/backend/cpu/kernel/OperationUtils.cc    |  23 ++++
 .../neurun/backend/cpu/kernel/OperationUtils.h     |   2 +
 7 files changed, 202 insertions(+), 100 deletions(-)
 create mode 100644 libs/cker/include/cker/FixedPoint.h

diff --git a/libs/cker/README.md b/libs/cker/README.md
index 5b6f98e..149320f 100644
--- a/libs/cker/README.md
+++ b/libs/cker/README.md
@@ -4,4 +4,4 @@ cker - Portable CPU kernel library
 
 __cker__ means `CPU kernel`
 
-Current __cker__ is porting of Tensorflow lite's reference_op kernel (Tensorflow 1.12)
+Current __cker__ is porting of Tensorflow lite's reference_op kernel (Tensorflow 1.12) and gemmlowp
diff --git a/libs/cker/include/cker/FixedPoint.h b/libs/cker/include/cker/FixedPoint.h
new file mode 100644
index 0000000..33178a9
--- /dev/null
+++ b/libs/cker/include/cker/FixedPoint.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_FIXED_POINT_H__
+#define __NNFW_CKER_FIXED_POINT_H__
+
+#include <cassert>
+#include <limits>
+
+namespace nnfw
+{
+namespace cker
+{
+
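+// Returns the high 32 bits of the rounded product 2 * a * b (a Q31 fixed-point
+// multiply); the single overflow case a == b == int32 min saturates to int32 max.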
+inline int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b)
+{
+  bool overflow = a == b && a == std::numeric_limits<int32_t>::min();
+  int64_t a_64(a);
+  int64_t b_64(b);
+  int64_t ab_64 = a_64 * b_64;
+  int32_t nudge = ab_64 >= 0 ? (1 << 30) : (1 - (1 << 30));
+  int32_t ab_x2_high32 = static_cast<int32_t>((ab_64 + nudge) / (1ll << 31));
+  return overflow ? std::numeric_limits<int32_t>::max() : ab_x2_high32;
+}
+
+// Correctly-rounded-to-nearest division by a power-of-two.
+// Also known as a rounding arithmetic right shift.
+inline int32_t RoundingDivideByPOT(int32_t x, int exponent)
+{
+  assert(exponent >= 0);
+  assert(exponent <= 31);
+  const int32_t mask = ((1ll << exponent) - 1);
+  const int32_t zero = 0;
+  const int32_t one = 1;
+  const int32_t remainder = x & mask;
+  const int32_t threshold = (mask >> 1) + ((x < zero ? ~zero : zero) & one);
+  return ((x >> exponent) + (((remainder > threshold) ? ~zero : zero) & one));
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_FIXED_POINT_H__
diff --git a/libs/cker/include/cker/Utils.h b/libs/cker/include/cker/Utils.h
index 14d713a..043cb5c 100644
--- a/libs/cker/include/cker/Utils.h
+++ b/libs/cker/include/cker/Utils.h
@@ -20,6 +20,8 @@
 #include <algorithm>
 
+#include "cker/FixedPoint.h"
+
 namespace nnfw
 {
 namespace cker
 {
@@ -31,6 +33,14 @@ inline float ActivationFunctionWithMinMax(float x, float output_activation_min,
   return std::min(std::max(x, output_activation_min), output_activation_max);
 }
 
+inline int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier, int shift)
+{
+  int left_shift = shift > 0 ? shift : 0;
+  int right_shift = shift > 0 ? 0 : -shift;
+  return RoundingDivideByPOT(
+      SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier), right_shift);
+}
+
 } // namespace cker
 } // namespace nnfw
 
diff --git a/libs/cker/include/cker/operation/Conv.h b/libs/cker/include/cker/operation/Conv.h
index b14af01..e494f0e 100644
--- a/libs/cker/include/cker/operation/Conv.h
+++ b/libs/cker/include/cker/operation/Conv.h
@@ -126,6 +126,89 @@ inline void Conv(const ConvParams &params, const Shape &input_shape, const float
   }
 }
 
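+// Quantized uint8 convolution: accumulates (input + input_offset) *
+// (filter + filter_offset) in int32, adds the bias, requantizes the result
+// with MultiplyByQuantizedMultiplier, adds output_offset and clamps it to
+// the quantized activation range.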
"Conv size is too large, not enough memory" << std::endl; \ - return false; \ - } \ - im2colGuard.reset(im2colData); \ - } - ConvolutionLayer::ConvolutionLayer() : _inputData(), _kernelData(), _outputData(), _biasData(), _inputShape(), _kernelShape(), _outputShape(), _biasShape(), _paddingLeft(0), _paddingTop(0), _paddingRight(0), @@ -123,52 +61,41 @@ bool ConvolutionLayer::convFloat32() bool ConvolutionLayer::convQuant8() { - ANDROID_NN_CONV_PARAMETERS(uint8_t) + int32_t output_activation_min = 0; + int32_t output_activation_max = 0; + CalculateActivationRangeUint8(_activation, _outputShape, &output_activation_min, + &output_activation_max); - int32_t inputOffset = -_inputShape.offset; - int32_t kernelOffset = -_kernelShape.offset; - int32_t outputOffset = _outputShape.offset; float real_multiplier = 0.0; int32_t output_multiplier = 0; int32_t output_shift = 0; - int32_t output_activation_min = 0; - int32_t output_activation_max = 0; - - const ::tflite::Dims<4> &kernel_dim = convertShapeToDims(_kernelShape); - const int kernel_width = ArraySize(kernel_dim, 1); - const int kernel_height = ArraySize(kernel_dim, 2); - const bool need_im2col = - _strideWidth != 1 || _strideHeight != 1 || kernel_width != 1 || kernel_height != 1; - - uint8_t *im2colDataToPass = nullptr; - if (need_im2col) - { - im2colDataToPass = im2colData; - } - if (!GetQuantizedConvolutionMultipler(_inputShape, _kernelShape, _biasShape, _outputShape, &real_multiplier) || - !QuantizeMultiplierSmallerThanOne(real_multiplier, &output_multiplier, &output_shift)) + !QuantizeMultiplier(real_multiplier, &output_multiplier, &output_shift)) { return false; } - CalculateActivationRangeUint8(_activation, _outputShape, &output_activation_min, - &output_activation_max); - int32_t dilationWidthFactor = 1, dilationHeightFactor = 1; - - static gemmlowp::GemmContext gemm_context; - // Prevent concurrent executions that may access the scratch buffer and - // gemm_context. - std::unique_lock lock(executionMutex); - // Alow gemmlowp automatically decide how many threads to use. 
+  nnfw::cker::ConvParams op_params;
+  op_params.stride_width = _strideWidth;
+  op_params.stride_height = _strideHeight;
+  op_params.dilation_width_factor = 1;
+  op_params.dilation_height_factor = 1;
+  op_params.padding_values.width = _paddingLeft;
+  op_params.padding_values.height = _paddingTop;
+  op_params.input_offset = -_inputShape.offset;
+  op_params.weights_offset = -_kernelShape.offset;
+  op_params.output_offset = _outputShape.offset;
+  op_params.output_multiplier = output_multiplier;
+  op_params.output_shift = output_shift;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+
+  nnfw::cker::Conv(op_params, convertShapeToCkerShape(_inputShape), _inputData.u8,
+                   convertShapeToCkerShape(_kernelShape), _kernelData.u8,
+                   convertShapeToCkerShape(_biasShape), _biasData.i32,
+                   convertShapeToCkerShape(_outputShape), _outputData.u8);
+
   return true;
 }
 
diff --git a/runtimes/neurun/backend/cpu/kernel/OperationUtils.cc b/runtimes/neurun/backend/cpu/kernel/OperationUtils.cc
index db59fa8..5bcc699 100644
--- a/runtimes/neurun/backend/cpu/kernel/OperationUtils.cc
+++ b/runtimes/neurun/backend/cpu/kernel/OperationUtils.cc
@@ -51,6 +51,29 @@ uint32_t getSizeOfDimension(const Shape &shape, uint32_t dimensionIdx)
   return shape.dimensions[dimensionIdx];
 }
 
+bool QuantizeMultiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift)
+{
+  if (double_multiplier == 0.)
+  {
+    *quantized_multiplier = 0;
+    *shift = 0;
+    return true;
+  }
+  const double q = std::frexp(double_multiplier, shift);
+  auto q_fixed = static_cast<int64_t>(std::round(q * (1ll << 31)));
+
+  assert(q_fixed <= (1ll << 31));
+  if (q_fixed == (1ll << 31))
+  {
+    q_fixed /= 2;
+    ++*shift;
+  }
+  assert(q_fixed <= std::numeric_limits<int32_t>::max());
+  *quantized_multiplier = static_cast<int32_t>(q_fixed);
+
+  return true;
+}
+
 bool QuantizeMultiplierSmallerThanOne(double double_multiplier, int32_t *quantized_multiplier,
                                       int32_t *right_shift)
 {
diff --git a/runtimes/neurun/backend/cpu/kernel/OperationUtils.h b/runtimes/neurun/backend/cpu/kernel/OperationUtils.h
index 6a3c543..95bc16d 100644
--- a/runtimes/neurun/backend/cpu/kernel/OperationUtils.h
+++ b/runtimes/neurun/backend/cpu/kernel/OperationUtils.h
@@ -175,6 +175,8 @@ inline TfLiteFusedActivation convertFusedActivation(FuseCode act)
   return kTfLiteActNone;
 }
 
+bool QuantizeMultiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift);
+
 __wur bool QuantizeMultiplierSmallerThanOne(double double_multiplier, int32_t *quantized_multiplier,
                                             int32_t *right_shift);
 
-- 
2.7.4