From 885e4936930d5ff0ab598022de8367355c1cd2ea Mon Sep 17 00:00:00 2001 From: =?utf8?q?=EA=B9=80=EC=88=98=EC=A7=84/=EB=8F=99=EC=9E=91=EC=A0=9C?= =?utf8?q?=EC=96=B4Lab=28SR=29/Engineer/=EC=82=BC=EC=84=B1=EC=A0=84?= =?utf8?q?=EC=9E=90?= Date: Mon, 16 Jul 2018 17:29:16 +0900 Subject: [PATCH] [newrt] Implement SoftMax kernel for CPU (#1968) This commit implements SoftMax kernel for CPU in new runtime. Add others - QuantizeMultiplierGreaterThanOne, CalculateInputRadius Signed-off-by: sjsujinkim --- .../new_runtime/src/internal/cpu/StageGenerator.cc | 43 ++++++++- .../internal/kernels/cpufallback/OperationUtils.cc | 28 ++++++ .../internal/kernels/cpufallback/OperationUtils.h | 4 + .../internal/kernels/cpufallback/SoftMaxLayer.cc | 106 +++++++++++++++++++++ .../internal/kernels/cpufallback/SoftMaxLayer.h | 52 ++++++++++ 5 files changed, 232 insertions(+), 1 deletion(-) create mode 100644 runtimes/new_runtime/src/internal/kernels/cpufallback/SoftMaxLayer.cc create mode 100644 runtimes/new_runtime/src/internal/kernels/cpufallback/SoftMaxLayer.h diff --git a/runtimes/new_runtime/src/internal/cpu/StageGenerator.cc b/runtimes/new_runtime/src/internal/cpu/StageGenerator.cc index 19319f8..f18f9ef 100644 --- a/runtimes/new_runtime/src/internal/cpu/StageGenerator.cc +++ b/runtimes/new_runtime/src/internal/cpu/StageGenerator.cc @@ -9,6 +9,7 @@ #include "internal/kernels/cpufallback/ConcatLayer.h" #include "internal/kernels/cpufallback/FullyConnectedLayer.h" #include "internal/kernels/cpufallback/ReshapeLayer.h" +#include "internal/kernels/cpufallback/SoftMaxLayer.h" #include "logging.h" @@ -459,7 +460,47 @@ Stage StageGenerator::generate(const ::internal::tflite::op::Reshape::Node &node Stage StageGenerator::generate(const ::internal::tflite::op::Softmax::Node &node) { - throw std::runtime_error("NYI"); + VERBOSE(Softmax) << "generate CPU Softmax" << std::endl; + + const ::internal::tflite::operand::Index output_index{node.param().output_index}; + const 
::internal::tflite::operand::Index input_index{node.param().input_index}; + const ::internal::tflite::operand::Index scale_index{node.param().scale_index}; + + struct Param + { + int output_index; + int input_index; + + ::internal::tflite::operand::Shape ofm_shape{1}; + ::internal::tflite::operand::Shape ifm_shape{1}; + + float scale; + }; + + Param param; + + param.output_index = output_index.asInt(); + param.input_index = input_index.asInt(); + + param.ofm_shape = _ctx.at(output_index).shape(); + param.ifm_shape = _ctx.at(input_index).shape(); + + param.scale = _ctx.at(scale_index).asScalar(); + + auto tensors = _tensor_builder; + + return [tensors, param](IExecutionBuilder &builder) { + auto output_alloc = tensors->at(::internal::tflite::operand::Index{param.output_index}).get(); + auto input_alloc = tensors->at(::internal::tflite::operand::Index{param.input_index}).get(); + + std::unique_ptr<::internal::kernels::cpu::SoftMaxLayer> fn{ + new ::internal::kernels::cpu::SoftMaxLayer}; + + fn->configure(input_alloc->buffer(), param.ifm_shape, param.scale, output_alloc->buffer(), + param.ofm_shape); + + builder.append(std::move(fn)); + }; } } // namespace stage diff --git a/runtimes/new_runtime/src/internal/kernels/cpufallback/OperationUtils.cc b/runtimes/new_runtime/src/internal/kernels/cpufallback/OperationUtils.cc index 062f1fd..ccb0bc2 100644 --- a/runtimes/new_runtime/src/internal/kernels/cpufallback/OperationUtils.cc +++ b/runtimes/new_runtime/src/internal/kernels/cpufallback/OperationUtils.cc @@ -76,6 +76,24 @@ bool GetQuantizedConvolutionMultipler(const Shape &inputShape, const Shape &filt return true; } +bool QuantizeMultiplierGreaterThanOne(double double_multiplier, int32_t *quantized_multiplier, + int *left_shift) +{ + assert(double_multiplier > 1.); + const double q = std::frexp(double_multiplier, left_shift); + int64_t q_fixed = static_cast(std::round(q * (1ll << 31))); + assert(q_fixed <= (1ll << 31)); + if (q_fixed == (1ll << 31)) + { + q_fixed /= 2; + 
++*left_shift; + } + assert(*left_shift >= 0); + assert(q_fixed <= std::numeric_limits::max()); + *quantized_multiplier = static_cast(q_fixed); + return true; +} + void CalculateActivationRangeFloat(int32_t activation, float *activation_min, float *activation_max) { if (activation == ANEURALNETWORKS_FUSED_RELU) @@ -140,6 +158,16 @@ void CalculateActivationRangeUint8(int32_t activation, const Shape &outputShape, } } +int32_t CalculateInputRadius(int input_integer_bits, int input_left_shift) +{ + const double max_input_rescaled = 1.0 * ((1 << input_integer_bits) - 1) * + (1ll << (31 - input_integer_bits)) / (1ll << input_left_shift); + // Tighten bound using floor. Suppose that we could use the exact value. + // After scaling the difference, the result would be at the maximum. Thus we + // must ensure that our value has lower magnitude. + return static_cast(std::floor(max_input_rescaled)); +} + Shape convertShape(const ::internal::tflite::operand::Shape &o) { Shape shape; diff --git a/runtimes/new_runtime/src/internal/kernels/cpufallback/OperationUtils.h b/runtimes/new_runtime/src/internal/kernels/cpufallback/OperationUtils.h index 6660941..3be938d 100644 --- a/runtimes/new_runtime/src/internal/kernels/cpufallback/OperationUtils.h +++ b/runtimes/new_runtime/src/internal/kernels/cpufallback/OperationUtils.h @@ -74,6 +74,8 @@ __wur bool QuantizeMultiplierSmallerThanOne(double double_multiplier, int32_t *q __wur bool GetQuantizedConvolutionMultipler(const Shape &inputShape, const Shape &filterShape, const Shape &biasShape, const Shape &outputShape, float *multiplier); +__wur bool QuantizeMultiplierGreaterThanOne(double double_multiplier, int32_t *quantized_multiplier, + int *left_shift); void CalculateActivationRangeFloat(int32_t activation, float *activation_min, float *activation_max); @@ -81,6 +83,8 @@ void CalculateActivationRangeFloat(int32_t activation, float *activation_min, void CalculateActivationRangeUint8(int32_t activation, const Shape &outputShape, int32_t 
*act_min, int32_t *act_max); +int32_t CalculateInputRadius(int input_integer_bits, int input_left_shift); + Shape convertShape(const ::internal::tflite::operand::Shape &o); uint32_t sizeOfData(OperandType type, const std::vector &dimensions); diff --git a/runtimes/new_runtime/src/internal/kernels/cpufallback/SoftMaxLayer.cc b/runtimes/new_runtime/src/internal/kernels/cpufallback/SoftMaxLayer.cc new file mode 100644 index 0000000..bd21fb7 --- /dev/null +++ b/runtimes/new_runtime/src/internal/kernels/cpufallback/SoftMaxLayer.cc @@ -0,0 +1,106 @@ +#include "SoftMaxLayer.h" + +#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h" +#include "internal/kernels/cpufallback/OperationUtils.h" + +namespace internal +{ +namespace kernels +{ +namespace cpu +{ + +bool SoftMaxLayer::softmaxFloat32() +{ + ::tflite::Dims<4> dim; + if (getNumberOfDimensions(_inputShape) == 2) + { + uint32_t batch_size = getSizeOfDimension(_inputShape, 0); + uint32_t input_size = getNumberOfElements(_inputShape) / batch_size; + Shape shapeIn4D; + shapeIn4D.dimensions = {batch_size, 1, 1, input_size}; + dim = convertShapeToDims(shapeIn4D); + } + else if (getNumberOfDimensions(_inputShape) == 4) + { + dim = convertShapeToDims(_inputShape); + } + else + { + std::cout << "only 2D and 4D tensors supported" << std::endl; + return false; + } + ::tflite::optimized_ops::Softmax(reinterpret_cast(_inputData), dim, _beta, + reinterpret_cast(_outputData), dim); + return true; +} + +bool SoftMaxLayer::softmaxQuant8() +{ + ::tflite::Dims<4> dim; + if (getNumberOfDimensions(_inputShape) == 2) + { + uint32_t batch_size = getSizeOfDimension(_inputShape, 0); + uint32_t input_size = getNumberOfElements(_inputShape) / batch_size; + Shape shapeIn4D; + shapeIn4D.dimensions = {batch_size, 1, 1, input_size}; + dim = convertShapeToDims(shapeIn4D); + } + else if (getNumberOfDimensions(_inputShape) == 4) + { + dim = convertShapeToDims(_inputShape); + } + else + { + std::cout << "only 2D and 4D tensors 
supported" << std::endl; + return false; + } + if (_outputShape.offset != 0 || _outputShape.scale != 1.f / 256) + { + std::cout << "incorrect scale / offset for output" << std::endl; + return false; + } + static const int32_t kScaledDiffIntegerBits = 5; + const double input_beta_real_multiplier = std::min( + 1.0 * _beta * _inputShape.scale * (1 << (31 - kScaledDiffIntegerBits)), (1ll << 31) - 1.0); + int32_t input_multiplier = 0; + int32_t input_left_shift = 0; + if (!QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier, &input_multiplier, + &input_left_shift)) + { + return false; + } + float diff_min = -1.0f * CalculateInputRadius(kScaledDiffIntegerBits, input_left_shift); + ::tflite::optimized_ops::Softmax(_inputData, dim, input_multiplier, input_left_shift, diff_min, + _outputData, dim); + return true; +} + +void SoftMaxLayer::configure(uint8_t *inputData, + const ::internal::tflite::operand::Shape &inputShape, const float beta, + uint8_t *outputData, + const ::internal::tflite::operand::Shape &outputShape) +{ + _inputData = inputData; + _inputShape = convertShape(inputShape); + _inputType = inputShape.type(); + _outputData = outputData; + _outputShape = convertShape(outputShape); + _beta = beta; +} + +void SoftMaxLayer::run() +{ + if (_inputType == static_cast(OperandType::TENSOR_FLOAT32)) + { + softmaxFloat32(); + } + else if (_inputType == static_cast(OperandType::TENSOR_QUANT8_ASYMM)) + { + softmaxQuant8(); + } +} + +} // namespace cpu +} // namespace kernels +} // namespace internal diff --git a/runtimes/new_runtime/src/internal/kernels/cpufallback/SoftMaxLayer.h b/runtimes/new_runtime/src/internal/kernels/cpufallback/SoftMaxLayer.h new file mode 100644 index 0000000..042a619 --- /dev/null +++ b/runtimes/new_runtime/src/internal/kernels/cpufallback/SoftMaxLayer.h @@ -0,0 +1,52 @@ +#ifndef __INTERNAL_KERNELS_CPU_SOFTMAXLAYER_H__ +#define __INTERNAL_KERNELS_CPU_SOFTMAXLAYER_H__ + +#include + +#include + +#include "internal/Model.h" +#include 
"internal/kernels/cpufallback/OperationUtils.h" + +using namespace internal::kernels::cpu; + +namespace internal +{ +namespace kernels +{ +namespace cpu +{ + +class SoftMaxLayer : public ::arm_compute::IFunction +{ +public: + SoftMaxLayer() {} + +public: + bool softmaxFloat32(); + + bool softmaxQuant8(); + + void configure(uint8_t *inputData, const ::internal::tflite::operand::Shape &inputShape, + const float beta, uint8_t *outputData, + const ::internal::tflite::operand::Shape &outputShape); + + void run(); + +private: + uint8_t *_inputData; + uint8_t *_outputData; + + float _beta; + + Shape _inputShape; + Shape _outputShape; + + int32_t _inputType; +}; + +} // namespace cpu +} // namespace kernels +} // namespace internal + +#endif // __INTERNAL_KERNELS_CPU_SOFTMAXLAYER_H__ -- 2.7.4