Imported Upstream version 1.7.0

[platform/core/ml/nnfw.git] / compute / cker / include / cker / operation / reference / BinaryArithmeticOps.h
diff --git a/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h

index 438a671..7a2b896 100644 (file)
--- a/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h
+++ b/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h
@@ -19,6 +19,7 @@
  #define __NNFW_CKER_REFERENCE_BINARYARITHMETICOPS_H__
  
  #include "cker/Shape.h"
+#include "cker/Types.h"
  #include "cker/Utils.h"
  
  #include <cmath>
@@ -61,6 +62,125 @@ inline void BinaryArithmeticOp(const BinaryArithmeticOpParam &params, const Shap
    }
  }
  
+template <typename T>
+inline void BroadcastBinaryArithmeticOpSlowQuant8(
+    const BinaryArithmeticOpParam &params, const Shape &input1_shape, const T *input1_data,
+    const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data,
+    const std::function<T(const BinaryArithmeticOpParam &params, const T &, const T &)> &fn)
+{
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
+  const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape);
+
+  if ((params.quantized_activation_min < 0) && (params.quantized_activation_max > 255))
+  {
+    throw std::runtime_error{"Support only for Quant8."};
+  }
+
+  // Comment from tensorflow lite:
+  //
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b)
+  {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y)
+    {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x)
+      {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c)
+        {
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
+              ActivationFunctionWithMinMax<uint8_t>(
+                  fn(params, input1_data[SubscriptToIndex(desc1, b, y, x, c)],
+                     input2_data[SubscriptToIndex(desc2, b, y, x, c)]),
+                  params.quantized_activation_min, params.quantized_activation_max);
+        }
+      }
+    }
+  }
+}
+template <typename T>
+inline void BroadcastBinaryArithmeticOpSlow(const BinaryArithmeticOpParam &params,
+                                            const Shape &input1_shape, const T *input1_data,
+                                            const Shape &input2_shape, const T *input2_data,
+                                            const Shape &output_shape, T *output_data,
+                                            const std::function<T(const T &, const T &)> &fn)
+{
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
+  const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape);
+
+  // Comment from tensorflow lite:
+  //
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b)
+  {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y)
+    {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x)
+      {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c)
+        {
+          output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax<T>(
+              fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)],
+                 input2_data[SubscriptToIndex(desc2, b, y, x, c)]),
+              params.quantized_activation_min, params.quantized_activation_max);
+        }
+      }
+    }
+  }
+}
+
+template <>
+inline void BroadcastBinaryArithmeticOpSlow(
+    const BinaryArithmeticOpParam &params, const Shape &input1_shape, const float *input1_data,
+    const Shape &input2_shape, const float *input2_data, const Shape &output_shape,
+    float *output_data, const std::function<float(const float &, const float &)> &fn)
+{
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
+  const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape);
+
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b)
+  {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y)
+    {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x)
+      {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c)
+        {
+          output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax(
+              fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)],
+                 input2_data[SubscriptToIndex(desc2, b, y, x, c)]),
+              params.float_activation_min, params.float_activation_max);
+        }
+      }
+    }
+  }
+}
+
  } // namespace reference
  } // namespace cker
  } // namespace nnfw