* limitations under the License.
*/
-#include "kernels/Add.h"
+#include "Builders.h"
+#include "kernels/Utils.h"
#include "kernels/BinaryOpCommon.h"
-#include "kernels/Utils.h"
-#include <tensorflow/lite/kernels/internal/reference/add.h>
-#include <tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h>
+#include "PALAdd.h"
namespace luci_interpreter
{
-namespace kernels
-{
-
-Add::Add(const Tensor *input1, const Tensor *input2, Tensor *output, const AddParams ¶ms)
- : KernelWithParams<AddParams>({input1, input2}, {output}, params)
-{
-}
-
-void Add::configure()
-{
- LUCI_INTERPRETER_CHECK(input1()->element_type() == input2()->element_type());
- LUCI_INTERPRETER_CHECK(input1()->element_type() == output()->element_type());
- if (input1()->element_type() == DataType::S16)
- {
- LUCI_INTERPRETER_CHECK(input1()->zero_points().size() == 1 &&
- input2()->zero_points().size() == 1);
- LUCI_INTERPRETER_CHECK(input1()->zero_point() == 0 && input2()->zero_point() == 0 &&
- output()->zero_point() == 0);
- }
-
- // TODO: enable it only if kernel with dynamic shapes
- output()->resize(calculateShapeForBroadcast(input1()->shape(), input2()->shape()));
-}
-
-void Add::execute() const
-{
- switch (input1()->element_type())
- {
- case DataType::FLOAT32:
- evalFloat();
- break;
- case DataType::S64:
- evalInteger<int64_t>();
- break;
- case DataType::S32:
- evalInteger<int32_t>();
- break;
- case DataType::U8:
- evalQuantized();
- break;
- case DataType::S16:
- evalQuantizedS16();
- break;
- default:
- assert(false && "Unsupported type.");
- }
-}
-
-void Add::evalFloat() const
-{
- tflite::ArithmeticParams params{};
- fillArithmeticActivationRange<float>(params, _params.activation);
-
- const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes(
- getTensorShape(input1()), getTensorShape(input2()), ¶ms);
- if (need_broadcast)
- {
- tflite::reference_ops::BroadcastAdd4DSlow(
- params, getTensorShape(input1()), getTensorData<float>(input1()), getTensorShape(input2()),
- getTensorData<float>(input2()), getTensorShape(output()), getTensorData<float>(output()));
- }
- else
- {
- tflite::reference_ops::Add(params, getTensorShape(input1()), getTensorData<float>(input1()),
- getTensorShape(input2()), getTensorData<float>(input2()),
- getTensorShape(output()), getTensorData<float>(output()));
- }
-}
-
-template <typename T> void Add::evalInteger() const
+void configure_kernel_CircleAdd(const circle::Operator *cur_op, BaseRuntimeGraph *runtime_graph)
{
- tflite::ArithmeticParams params{};
- fillArithmeticActivationRange<T>(params, _params.activation);
+ kernels::TISOKernel kernel(cur_op, runtime_graph);
- const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes(
- getTensorShape(input1()), getTensorShape(input2()), ¶ms);
+ LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) ==
+ Tensor::element_type(kernel.input2()));
+ LUCI_INTERPRETER_CHECK(Tensor::element_type(kernel.input1()) ==
+ Tensor::element_type(kernel.input2()));
- if (need_broadcast)
+#ifndef DIS_QUANT
+ if (Tensor::element_type(kernel.input1()) == DataType::S16)
{
- tflite::reference_ops::BroadcastAdd4DSlow(
- params, getTensorShape(input1()), getTensorData<T>(input1()), getTensorShape(input2()),
- getTensorData<T>(input2()), getTensorShape(output()), getTensorData<T>(output()));
- }
- else
- {
- tflite::reference_ops::Add(params, getTensorShape(input1()), getTensorData<T>(input1()),
- getTensorShape(input2()), getTensorData<T>(input2()),
- getTensorShape(output()), getTensorData<T>(output()));
+ LUCI_INTERPRETER_CHECK(Tensor::zero_points(kernel.input1()).size() == 1 &&
+ Tensor::zero_points(kernel.input2()).size() == 1);
+ LUCI_INTERPRETER_CHECK(Tensor::zero_point(kernel.input1()) == 0 &&
+ Tensor::zero_point(kernel.input2()) == 0 &&
+ Tensor::zero_point(kernel.output()) == 0);
}
+#endif // DIS_QUANT
}
-void Add::evalQuantized() const
+void execute_kernel_CircleAdd(const circle::Operator *cur_op, BaseRuntimeGraph *runtime_graph)
{
- const auto input1_scale = static_cast<double>(input1()->scale());
- const auto input2_scale = static_cast<double>(input2()->scale());
- const auto output_scale = static_cast<double>(output()->scale());
-
- const int left_shift = 20;
- const double twice_max_input_scale = 2 * std::max(input1_scale, input2_scale);
- const double real_input1_multiplier = input1_scale / twice_max_input_scale;
- const double real_input2_multiplier = input2_scale / twice_max_input_scale;
- const double real_output_multiplier = twice_max_input_scale / ((1 << left_shift) * output_scale);
-
- int32_t input1_multiplier{}, input2_multiplier{}, output_multiplier{};
- int input1_shift{}, input2_shift{}, output_shift{};
- quantizeMultiplierSmallerThanOneExp(real_input1_multiplier, &input1_multiplier, &input1_shift);
- quantizeMultiplierSmallerThanOneExp(real_input2_multiplier, &input2_multiplier, &input2_shift);
- quantizeMultiplierSmallerThanOneExp(real_output_multiplier, &output_multiplier, &output_shift);
+ kernels::TISOKernel kernel(cur_op, runtime_graph);
- int32_t activation_min{};
- int32_t activation_max{};
- calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
+ const auto *options = cur_op->builtin_options_as_AddOptions();
- tflite::ArithmeticParams params{};
- params.left_shift = left_shift;
- // The kernel expects inputs' zero points to be negated.
- params.input1_offset = -input1()->zero_point(); // Note the '-'.
- params.input1_multiplier = input1_multiplier;
- params.input1_shift = input1_shift;
- params.input2_offset = -input2()->zero_point(); // Note the '-'.
- params.input2_multiplier = input2_multiplier;
- params.input2_shift = input2_shift;
- params.output_offset = output()->zero_point();
- params.output_multiplier = output_multiplier;
- params.output_shift = output_shift;
- params.quantized_activation_min = activation_min;
- params.quantized_activation_max = activation_max;
+ luci_interpreter::RuntimeShape input_shape1 =
+ kernels::getTensorRuntimeShape(kernel.input1(), runtime_graph);
+ luci_interpreter::RuntimeShape input_shape2 =
+ kernels::getTensorRuntimeShape(kernel.input2(), runtime_graph);
- const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes(
- getTensorShape(input1()), getTensorShape(input2()), ¶ms);
+ bool is_inplace = runtime_graph->is_inplace_op(cur_op);
- if (need_broadcast)
+ switch (Tensor::element_type(kernel.input1()))
{
- tflite::reference_ops::BroadcastAdd4DSlow(
- params, getTensorShape(input1()), getTensorData<uint8_t>(input1()), getTensorShape(input2()),
- getTensorData<uint8_t>(input2()), getTensorShape(output()), getTensorData<uint8_t>(output()));
- }
- else
- {
- tflite::reference_ops::Add(params, getTensorShape(input1()), getTensorData<uint8_t>(input1()),
- getTensorShape(input2()), getTensorData<uint8_t>(input2()),
- getTensorShape(output()), getTensorData<uint8_t>(output()));
+#ifndef DIS_FLOAT
+ case DataType::FLOAT32:
+ {
+ auto tiso_func = luci_interpreter_pal::Add<float>;
+ auto broadcast_tiso_func = luci_interpreter_pal::BroadcastAdd4DSlow<float>;
+ if (is_inplace)
+ {
+ kernels::evalTISOInplaceKernel<float>(tiso_func, broadcast_tiso_func, &kernel, options,
+ std::move(input_shape1), std::move(input_shape2));
+ }
+ else
+ {
+ kernels::TISOData kernel_data = kernel.readData();
+ kernels::evalTISOKernel<float>(tiso_func, broadcast_tiso_func, &kernel, &kernel_data,
+ options, std::move(input_shape1), std::move(input_shape2));
+ }
+ }
+ break;
+#endif // DIS_FLOAT
+ case DataType::S64:
+ {
+ auto tiso_func = luci_interpreter_pal::Add<int64_t>;
+ auto broadcast_tiso_func = luci_interpreter_pal::BroadcastAdd4DSlow<int64_t>;
+ if (is_inplace)
+ {
+ kernels::evalTISOInplaceKernel<int64_t>(tiso_func, broadcast_tiso_func, &kernel, options,
+ std::move(input_shape1), std::move(input_shape2));
+ }
+ else
+ {
+ kernels::TISOData kernel_data = kernel.readData();
+ kernels::evalTISOKernel<int64_t>(tiso_func, broadcast_tiso_func, &kernel, &kernel_data,
+ options, std::move(input_shape1), std::move(input_shape2));
+ }
+ }
+ break;
+ case DataType::S32:
+ {
+ auto tiso_func = luci_interpreter_pal::Add<int32_t>;
+ auto broadcast_tiso_func = luci_interpreter_pal::BroadcastAdd4DSlow<int32_t>;
+ if (is_inplace)
+ {
+ kernels::evalTISOInplaceKernel<int32_t>(tiso_func, broadcast_tiso_func, &kernel, options,
+ std::move(input_shape1), std::move(input_shape2));
+ }
+ else
+ {
+ kernels::TISOData kernel_data = kernel.readData();
+ kernels::evalTISOKernel<int32_t>(tiso_func, broadcast_tiso_func, &kernel, &kernel_data,
+ options, std::move(input_shape1), std::move(input_shape2));
+ }
+ }
+ break;
+ default:
+ assert(false && "Unsupported type.");
}
}
-void Add::evalQuantizedS16() const
-{
- const auto input1_scale = static_cast<double>(input1()->scale());
- const auto input2_scale = static_cast<double>(input2()->scale());
- const auto output_scale = static_cast<double>(output()->scale());
-
- constexpr int left_shift = 12;
- const double twice_max_input_scale = 2 * std::max(input1_scale, input2_scale);
- const double real_input1_multiplier = input1_scale / twice_max_input_scale;
- const double real_input2_multiplier = input2_scale / twice_max_input_scale;
- const double real_output_multiplier = twice_max_input_scale / ((1 << left_shift) * output_scale);
-
- int32_t input1_multiplier{}, input2_multiplier{}, output_multiplier{};
- int input1_shift{}, input2_shift{}, output_shift{};
- quantizeMultiplierSmallerThanOneExp(real_input1_multiplier, &input1_multiplier, &input1_shift);
- quantizeMultiplierSmallerThanOneExp(real_input2_multiplier, &input2_multiplier, &input2_shift);
- quantizeMultiplierSmallerThanOneExp(real_output_multiplier, &output_multiplier, &output_shift);
-
- int32_t activation_min{};
- int32_t activation_max{};
- calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
-
- auto fn = [input1_multiplier, input1_shift, //
- input2_multiplier, input2_shift, //
- output_multiplier, output_shift, //
- activation_min, activation_max](int16_t input1_val, int16_t input2_val) {
- const int32_t shifted_input1_val = static_cast<int32_t>(input1_val) << left_shift;
- const int32_t shifted_input2_val = static_cast<int32_t>(input2_val) << left_shift;
- const int32_t scaled_input1_val = tflite::MultiplyByQuantizedMultiplierSmallerThanOneExp(
- shifted_input1_val, input1_multiplier, input1_shift);
- const int32_t scaled_input2_val = tflite::MultiplyByQuantizedMultiplierSmallerThanOneExp(
- shifted_input2_val, input2_multiplier, input2_shift);
- const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
- const int32_t raw_output = tflite::MultiplyByQuantizedMultiplierSmallerThanOneExp(
- raw_sum, output_multiplier, output_shift);
- const int32_t clamped_output = std::min(activation_max, std::max(activation_min, raw_output));
- return static_cast<int16_t>(clamped_output);
- };
-
- BinaryOpBroadcastSlow(getTensorShape(input1()), getTensorData<int16_t>(input1()),
- getTensorShape(input2()), getTensorData<int16_t>(input2()),
- getTensorShape(output()), getTensorData<int16_t>(output()), fn);
-}
-
-} // namespace kernels
} // namespace luci_interpreter