From 885e4936930d5ff0ab598022de8367355c1cd2ea Mon Sep 17 00:00:00 2001
From: =?utf8?q?=EA=B9=80=EC=88=98=EC=A7=84/=EB=8F=99=EC=9E=91=EC=A0=9C?=
 =?utf8?q?=EC=96=B4Lab=28SR=29/Engineer/=EC=82=BC=EC=84=B1=EC=A0=84?=
 =?utf8?q?=EC=9E=90?= <sjsujin.kim@samsung.com>
Date: Mon, 16 Jul 2018 17:29:16 +0900
Subject: [PATCH] [newrt] Implement SoftMax kernel for CPU (#1968)

This commit implements SoftMax kernel for CPU in new runtime.

Also adds the helper functions the quantized path needs:
  - QuantizeMultiplierGreaterThanOne, CalculateInputRadius

Signed-off-by: sjsujinkim <sjsujin.kim@samsung.com>
---
 .../new_runtime/src/internal/cpu/StageGenerator.cc |  43 ++++++++-
 .../internal/kernels/cpufallback/OperationUtils.cc |  28 ++++++
 .../internal/kernels/cpufallback/OperationUtils.h  |   4 +
 .../internal/kernels/cpufallback/SoftMaxLayer.cc   | 106 +++++++++++++++++++++
 .../internal/kernels/cpufallback/SoftMaxLayer.h    |  52 ++++++++++
 5 files changed, 232 insertions(+), 1 deletion(-)
 create mode 100644 runtimes/new_runtime/src/internal/kernels/cpufallback/SoftMaxLayer.cc
 create mode 100644 runtimes/new_runtime/src/internal/kernels/cpufallback/SoftMaxLayer.h
diff --git a/runtimes/new_runtime/src/internal/cpu/StageGenerator.cc b/runtimes/new_runtime/src/internal/cpu/StageGenerator.cc
index 19319f8..f18f9ef 100644
--- a/runtimes/new_runtime/src/internal/cpu/StageGenerator.cc
+++ b/runtimes/new_runtime/src/internal/cpu/StageGenerator.cc
@@ -9,6 +9,7 @@
 #include "internal/kernels/cpufallback/ConcatLayer.h"
 #include "internal/kernels/cpufallback/FullyConnectedLayer.h"
 #include "internal/kernels/cpufallback/ReshapeLayer.h"
+#include "internal/kernels/cpufallback/SoftMaxLayer.h"
 
 #include "logging.h"
 
@@ -459,7 +460,47 @@ Stage StageGenerator::generate(const ::internal::tflite::op::Reshape::Node &node
 
 Stage StageGenerator::generate(const ::internal::tflite::op::Softmax::Node &node)
 {
-  throw std::runtime_error("NYI");
+  VERBOSE(Softmax) << "generate CPU Softmax" << std::endl;
+  // Collect what the Softmax kernel needs now; the kernel is built when the stage is invoked.
+  const ::internal::tflite::operand::Index output_index{node.param().output_index};
+  const ::internal::tflite::operand::Index input_index{node.param().input_index};
+  const ::internal::tflite::operand::Index scale_index{node.param().scale_index};
+  // Plain-value bundle that the returned lambda captures by copy.
+  struct Param
+  {
+    int output_index;
+    int input_index;
+    // Constructed from {1} only as a placeholder; both shapes are assigned below.
+    ::internal::tflite::operand::Shape ofm_shape{1};
+    ::internal::tflite::operand::Shape ifm_shape{1};
+    // Value of the scale operand; handed to SoftMaxLayer::configure as its beta argument.
+    float scale;
+  };
+
+  Param param;
+
+  param.output_index = output_index.asInt();
+  param.input_index = input_index.asInt();
+
+  param.ofm_shape = _ctx.at(output_index).shape();
+  param.ifm_shape = _ctx.at(input_index).shape();
+
+  param.scale = _ctx.at(scale_index).asScalar<float>();
+  // Local copy of the member so the lambda can capture it by value.
+  auto tensors = _tensor_builder;
+  // Stage body: look up both tensors, configure a SoftMaxLayer on their buffers, append it.
+  return [tensors, param](IExecutionBuilder &builder) {
+    auto output_alloc = tensors->at(::internal::tflite::operand::Index{param.output_index}).get();
+    auto input_alloc = tensors->at(::internal::tflite::operand::Index{param.input_index}).get();
+
+    std::unique_ptr<::internal::kernels::cpu::SoftMaxLayer> fn{
+        new ::internal::kernels::cpu::SoftMaxLayer};
+
+    fn->configure(input_alloc->buffer(), param.ifm_shape, param.scale, output_alloc->buffer(),
+                  param.ofm_shape);
+
+    builder.append(std::move(fn));
+  };
 }
 
 } // namespace stage
diff --git a/runtimes/new_runtime/src/internal/kernels/cpufallback/OperationUtils.cc b/runtimes/new_runtime/src/internal/kernels/cpufallback/OperationUtils.cc
index 062f1fd..ccb0bc2 100644
--- a/runtimes/new_runtime/src/internal/kernels/cpufallback/OperationUtils.cc
+++ b/runtimes/new_runtime/src/internal/kernels/cpufallback/OperationUtils.cc
@@ -76,6 +76,24 @@ bool GetQuantizedConvolutionMultipler(const Shape &inputShape, const Shape &filt
   return true;
 }
 
+// Expresses double_multiplier as quantized_multiplier * 2^(left_shift - 31) with the mantissa in
+// [2^30, 2^31). Fails (outputs untouched) unless the multiplier is finite and greater than one.
+bool QuantizeMultiplierGreaterThanOne(double double_multiplier, int32_t *quantized_multiplier,
+                                      int *left_shift)
+{
+  if (!std::isfinite(double_multiplier) || double_multiplier <= 1.)
+    return false;
+  const double q = std::frexp(double_multiplier, left_shift); // q in [0.5, 1), *left_shift >= 1
+  int64_t q_fixed = static_cast<int64_t>(std::round(q * (1ll << 31)));
+  if (q_fixed == (1ll << 31))
+  {
+    q_fixed /= 2; // rounding carried q up to 1.0: renormalise to 0.5 and bump the exponent
+    ++*left_shift;
+  }
+  *quantized_multiplier = static_cast<int32_t>(q_fixed); // q_fixed < 2^31 here, so this is exact
+  return true;
+}
+
 void CalculateActivationRangeFloat(int32_t activation, float *activation_min, float *activation_max)
 {
   if (activation == ANEURALNETWORKS_FUSED_RELU)
@@ -140,6 +158,16 @@ void CalculateActivationRangeUint8(int32_t activation, const Shape &outputShape,
   }
 }
 
+// Radius = floor((2^input_integer_bits - 1) * 2^(31 - input_integer_bits) / 2^input_left_shift).
+// The floor tightens the bound: with the exact value, the scaled difference would end up at the
+// maximum, so the value returned here has to be of lower magnitude.
+int32_t CalculateInputRadius(int input_integer_bits, int input_left_shift)
+{
+  const double unshifted_max =
+      1.0 * ((1 << input_integer_bits) - 1) * (1ll << (31 - input_integer_bits));
+  return static_cast<int32_t>(std::floor(unshifted_max / (1ll << input_left_shift)));
+}
+
 Shape convertShape(const ::internal::tflite::operand::Shape &o)
 {
   Shape shape;
diff --git a/runtimes/new_runtime/src/internal/kernels/cpufallback/OperationUtils.h b/runtimes/new_runtime/src/internal/kernels/cpufallback/OperationUtils.h
index 6660941..3be938d 100644
--- a/runtimes/new_runtime/src/internal/kernels/cpufallback/OperationUtils.h
+++ b/runtimes/new_runtime/src/internal/kernels/cpufallback/OperationUtils.h
@@ -74,6 +74,8 @@ __wur bool QuantizeMultiplierSmallerThanOne(double double_multiplier, int32_t *q
 __wur bool GetQuantizedConvolutionMultipler(const Shape &inputShape, const Shape &filterShape,
                                             const Shape &biasShape, const Shape &outputShape,
                                             float *multiplier);
+__wur bool QuantizeMultiplierGreaterThanOne(double double_multiplier, int32_t *quantized_multiplier,
+                                            int *left_shift);
 
 void CalculateActivationRangeFloat(int32_t activation, float *activation_min,
                                    float *activation_max);
@@ -81,6 +83,8 @@ void CalculateActivationRangeFloat(int32_t activation, float *activation_min,
 void CalculateActivationRangeUint8(int32_t activation, const Shape &outputShape, int32_t *act_min,
                                    int32_t *act_max);
 
+int32_t CalculateInputRadius(int input_integer_bits, int input_left_shift);
+
 Shape convertShape(const ::internal::tflite::operand::Shape &o);
 
 uint32_t sizeOfData(OperandType type, const std::vector<uint32_t> &dimensions);
diff --git a/runtimes/new_runtime/src/internal/kernels/cpufallback/SoftMaxLayer.cc b/runtimes/new_runtime/src/internal/kernels/cpufallback/SoftMaxLayer.cc
new file mode 100644
index 0000000..bd21fb7
--- /dev/null
+++ b/runtimes/new_runtime/src/internal/kernels/cpufallback/SoftMaxLayer.cc
@@ -0,0 +1,106 @@
+#include "SoftMaxLayer.h"
+
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "internal/kernels/cpufallback/OperationUtils.h"
+
+namespace internal
+{
+namespace kernels
+{
+namespace cpu
+{
+
+// Float softmax via ::tflite::optimized_ops::Softmax on _inputData/_outputData with _beta.
+// Returns false, after printing a message, unless the input shape is 2D or 4D.
+bool SoftMaxLayer::softmaxFloat32()
+{
+  const auto rank = getNumberOfDimensions(_inputShape);
+  if (rank != 2 && rank != 4)
+  {
+    std::cout << "only 2D and 4D tensors supported" << std::endl;
+    return false;
+  }
+
+  // The kernel takes 4D dims, so a 2D input is described as {batch, 1, 1, elements / batch}.
+  Shape shapeIn4D;
+  if (rank == 2)
+  {
+    const uint32_t batch_size = getSizeOfDimension(_inputShape, 0);
+    const uint32_t input_size = getNumberOfElements(_inputShape) / batch_size;
+    shapeIn4D.dimensions = {batch_size, 1, 1, input_size};
+  }
+  const ::tflite::Dims<4> dim = convertShapeToDims(rank == 2 ? shapeIn4D : _inputShape);
+  ::tflite::optimized_ops::Softmax(reinterpret_cast<const float *>(_inputData), dim, _beta,
+                                   reinterpret_cast<float *>(_outputData), dim);
+  return true;
+}
+
+// uint8 softmax via ::tflite::optimized_ops::Softmax. Returns false for an input that is not
+// 2D/4D, an output not quantized as scale 1/256 offset 0, or an unquantizable beta multiplier.
+bool SoftMaxLayer::softmaxQuant8()
+{
+  const auto rank = getNumberOfDimensions(_inputShape);
+  if (rank != 2 && rank != 4)
+  {
+    std::cout << "only 2D and 4D tensors supported" << std::endl;
+    return false;
+  }
+  // The kernel takes 4D dims, so a 2D input is described as {batch, 1, 1, elements / batch}.
+  Shape shapeIn4D;
+  if (rank == 2)
+  {
+    const uint32_t batch_size = getSizeOfDimension(_inputShape, 0);
+    const uint32_t input_size = getNumberOfElements(_inputShape) / batch_size;
+    shapeIn4D.dimensions = {batch_size, 1, 1, input_size};
+  }
+  const ::tflite::Dims<4> dim = convertShapeToDims(rank == 2 ? shapeIn4D : _inputShape);
+  if (_outputShape.offset != 0 || _outputShape.scale != 1.f / 256)
+  {
+    std::cout << "incorrect scale / offset for output" << std::endl;
+    return false;
+  }
+  static const int32_t kScaledDiffIntegerBits = 5;
+  const double input_beta_real_multiplier = std::min(
+      1.0 * _beta * _inputShape.scale * (1 << (31 - kScaledDiffIntegerBits)), (1ll << 31) - 1.0);
+  int32_t input_multiplier = 0;
+  int32_t input_left_shift = 0;
+  if (!QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier, &input_multiplier,
+                                        &input_left_shift))
+  {
+    return false;
+  }
+  // Kept as an integer: the radius is an int32_t, so a float round-trip could only lose bits.
+  const int32_t diff_min = -CalculateInputRadius(kScaledDiffIntegerBits, input_left_shift);
+  ::tflite::optimized_ops::Softmax(_inputData, dim, input_multiplier, input_left_shift, diff_min,
+                                   _outputData, dim);
+  return true;
+}
+
+void SoftMaxLayer::configure(uint8_t *inputData,
+                             const ::internal::tflite::operand::Shape &inputShape, const float beta,
+                             uint8_t *outputData,
+                             const ::internal::tflite::operand::Shape &outputShape)
+{
+  _beta = beta;                             // passed to the float kernel, scaled in the quant8 one
+  _inputType = inputShape.type();           // selects the kernel in run()
+  _inputShape = convertShape(inputShape);   // operand shape -> kernel-side Shape
+  _outputShape = convertShape(outputShape); // the output operand type is not recorded
+  _inputData = inputData;                   // buffers are stored as raw pointers, not copied
+  _outputData = outputData;
+}
+
+// Dispatches on the input operand type. run() cannot return a status, so a kernel that reports
+// failure, or an operand type without a kernel, is at least announced instead of being ignored.
+void SoftMaxLayer::run()
+{
+  const bool isFloat = _inputType == static_cast<int32_t>(OperandType::TENSOR_FLOAT32);
+  const bool isQuant8 = _inputType == static_cast<int32_t>(OperandType::TENSOR_QUANT8_ASYMM);
+  if (!(isFloat ? softmaxFloat32() : (isQuant8 && softmaxQuant8())))
+  {
+    std::cout << "SoftMaxLayer: softmax failed or input type is unsupported" << std::endl;
+  }
+}
+
+} // namespace cpu
+} // namespace kernels
+} // namespace internal
diff --git a/runtimes/new_runtime/src/internal/kernels/cpufallback/SoftMaxLayer.h b/runtimes/new_runtime/src/internal/kernels/cpufallback/SoftMaxLayer.h
new file mode 100644
index 0000000..042a619
--- /dev/null
+++ b/runtimes/new_runtime/src/internal/kernels/cpufallback/SoftMaxLayer.h
@@ -0,0 +1,52 @@
+#ifndef __INTERNAL_KERNELS_CPU_SOFTMAXLAYER_H__
+#define __INTERNAL_KERNELS_CPU_SOFTMAXLAYER_H__
+
+#include <NeuralNetworks.h>
+
+#include <arm_compute/runtime/IFunction.h>
+
+#include "internal/Model.h"
+#include "internal/kernels/cpufallback/OperationUtils.h"
+
+using namespace internal::kernels::cpu;
+
+namespace internal
+{
+namespace kernels
+{
+namespace cpu
+{
+
+// CPU softmax function object. configure() records the buffers, shapes and beta; run() picks
+// the float32 or quant8 kernel from the input operand type. The buffers are never freed here.
+class SoftMaxLayer : public ::arm_compute::IFunction
+{
+public:
+  SoftMaxLayer() {}
+
+  bool softmaxFloat32();
+  bool softmaxQuant8();
+
+  void configure(uint8_t *inputData, const ::internal::tflite::operand::Shape &inputShape,
+                 const float beta, uint8_t *outputData,
+                 const ::internal::tflite::operand::Shape &outputShape);
+
+  void run() override;
+
+private:
+  // Default member initialisers give a never-configured layer well-defined (null/zero) state.
+  uint8_t *_inputData{nullptr};
+  uint8_t *_outputData{nullptr};
+  float _beta{0.0f};
+
+  Shape _inputShape;
+  Shape _outputShape;
+
+  int32_t _inputType{0};
+};
+
+} // namespace cpu
+} // namespace kernels
+} // namespace internal
+
+#endif // __INTERNAL_KERNELS_CPU_SOFTMAXLAYER_H__
-- 
2.7.4