#include "kernels/Utils.h"
-#include <tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h>
+#include "PALConv2d.h"
#include <stdexcept>
#include <thread>
{
Conv2D::Conv2D(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output,
- const Conv2DParams &params)
- : KernelWithParams<Conv2DParams>({input, filter, bias}, {output}, params)
+ Tensor *im2col, const Conv2DParams &params)
+ : KernelWithParams<Conv2DParams>({input, filter, bias}, {output, im2col}, params) // im2col is registered as output #1
{
}
// (3) | uint8 uint8 int32 uint8 | quantized
// (4) | int8 int8 int32 int8 | quantized per channel
//
- // We only support (1) and (3) for now, and additionally the following:
+ // We only support (1), (3) and (4) for now, and additionally the following:
// | input filter bias output |
// ----+---------------------------+
// (5) | int16 int16 int64 int16 |
{
LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
}
+ else if (input()->element_type() == DataType::S8 && filter()->element_type() == DataType::S8)
+ {
+ LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
+ LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
+ LUCI_INTERPRETER_CHECK(filter()->scales().size() ==
+ static_cast<size_t>(filter()->shape().dim(0)));
+ for (auto zerop : filter()->zero_points())
+ {
+ LUCI_INTERPRETER_CHECK(zerop == 0);
+ }
+ }
else if (input()->element_type() == DataType::S16 && filter()->element_type() == DataType::S16)
{
LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S64);
_params.dilation_height_factor != 1 || _params.dilation_width_factor != 1;
const bool need_non_dilated_im2col = _params.stride_height != 1 || _params.stride_width != 1 ||
filter_height != 1 || filter_width != 1;
- const bool need_im2col =
+ _need_im2col =
input()->element_type() != DataType::S16 && (need_dilated_im2col || need_non_dilated_im2col);
- if (need_im2col)
+ if (_need_im2col)
{
const int input_depth = input_shape.dim(3);
Shape im2col_shape{batches, output_height, output_width,
input_depth * filter_height * filter_width};
- try
- {
- _im2col =
- std::make_unique<Tensor>(input()->element_type(), im2col_shape, AffineQuantization{}, "");
- }
- catch (std::bad_alloc &ba)
- {
- // Failed memory allocation
- _im2col = nullptr;
- }
+ auto im2col = getOutputTensors()[1];
+ im2col->resize(im2col_shape);
+ }
+ else
+ {
+ auto im2col = getOutputTensors()[1];
+ im2col->set_allocatable(false);
}
}
evalQuantizedPerChannel();
}
break;
+ case DataType::S8:
+ evalQuantizedS8PerChannel();
+ break;
case DataType::S16:
evalQuantizedS16();
break;
default:
throw std::runtime_error("Unsupported type.");
}
- if (!!_im2col)
- _im2col->deallocate();
}
void Conv2D::evalFloat() const
params.float_activation_min = activation_min;
params.float_activation_max = activation_max;
- if (_im2col)
+ float *im2col_data = nullptr;
+ auto im2col = getOutputTensors()[1];
+ if (_need_im2col)
{
- try
- {
- tflite::optimized_ops::Conv(
- params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(filter()),
- getTensorData<float>(filter()), getTensorShape(bias()), getTensorData<float>(bias()),
- getTensorShape(output()), getTensorData<float>(output()), getTensorShape(_im2col.get()),
- getTensorData<float>(_im2col.get()));
- }
- catch (std::bad_alloc &ba)
- {
- // Failed memory allocation
- _im2col->deallocate();
-
- tflite::reference_ops::Conv(
- params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(filter()),
- getTensorData<float>(filter()), getTensorShape(bias()), getTensorData<float>(bias()),
- getTensorShape(output()), getTensorData<float>(output()), tflite::RuntimeShape(), nullptr);
- }
+ im2col_data = im2col->data<float>();
}
- else
- tflite::reference_ops::Conv(
- params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(filter()),
- getTensorData<float>(filter()), getTensorShape(bias()), getTensorData<float>(bias()),
- getTensorShape(output()), getTensorData<float>(output()), tflite::RuntimeShape(), nullptr);
+ luci_interpreter_pal::Conv(
+ params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(filter()),
+ getTensorData<float>(filter()), getTensorShape(bias()), getTensorData<float>(bias()),
+ getTensorShape(output()), getTensorData<float>(output()), getTensorShape(im2col), im2col_data);
}
void Conv2D::evalQuantized() const
params.quantized_activation_min = activation_min;
params.quantized_activation_max = activation_max;
- // TODO This should only be done once (although it takes only a few microseconds).
- // Also, the user should be able to adjust the number of threads.
- auto gemmlowp_context = std::make_unique<gemmlowp::GemmContext>();
- gemmlowp_context->set_max_num_threads(static_cast<int>(std::thread::hardware_concurrency()));
-
- tflite::optimized_ops::Conv(
- params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(filter()),
- getTensorData<uint8_t>(filter()), getTensorShape(bias()), getTensorData<int32_t>(bias()),
- getTensorShape(output()), getTensorData<uint8_t>(output()), getTensorShape(_im2col.get()),
- getTensorData<uint8_t>(_im2col.get()), gemmlowp_context.get());
+ auto im2col = getOutputTensors()[1];
+ luci_interpreter_pal::Conv(params, getTensorShape(input()), getTensorData<uint8_t>(input()),
+ getTensorShape(filter()), getTensorData<uint8_t>(filter()),
+ getTensorShape(bias()), getTensorData<int32_t>(bias()),
+ getTensorShape(output()), getTensorData<uint8_t>(output()),
+ getTensorShape(im2col), getTensorData<uint8_t>(im2col));
}
void Conv2D::evalQuantizedPerChannel() const
}
}
+void Conv2D::evalQuantizedS8PerChannel() const
+{ // Runs the int8 convolution with per-channel multipliers via luci_interpreter_pal::ConvPerChannel.
+ int32_t activation_min{};
+ int32_t activation_max{};
+ calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
+
+ tflite::ConvParams params{};
+ params.padding_values.height = _padding_height;
+ params.padding_values.width = _padding_width;
+ params.stride_height = _params.stride_height;
+ params.stride_width = _params.stride_width;
+ params.dilation_height_factor = _params.dilation_height_factor;
+ params.dilation_width_factor = _params.dilation_width_factor;
+ // The kernel expects the input zero point to be negated; the filter offset is left at 0.
+ params.input_offset = -input()->zero_point(); // Note the '-'.
+ params.weights_offset = 0; // Unused in tflite code
+ params.output_offset = output()->zero_point();
+ params.quantized_activation_min = activation_min;
+ params.quantized_activation_max = activation_max;
+
+ const std::vector<double> effective_output_scales =
+ getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
+
+ std::vector<ChannelQuantMultipliers> quant_multipliers =
+ quantizeMultipliers(effective_output_scales);
+
+ std::vector<int32_t> shifts; // receives cm.shift of every entry; handed to the kernel as a flat array
+ std::transform(quant_multipliers.begin(), quant_multipliers.end(), std::back_inserter(shifts),
+ [](ChannelQuantMultipliers cm) { return cm.shift; });
+ std::vector<int32_t> multipliers; // receives cm.multiplier of every entry, same order as shifts
+ std::transform(quant_multipliers.begin(), quant_multipliers.end(),
+ std::back_inserter(multipliers),
+ [](ChannelQuantMultipliers cm) { return cm.multiplier; });
+
+ int8_t *im2col_data = nullptr; // stays null unless _need_im2col is set
+ auto im2col = getOutputTensors()[1]; // the im2col tensor (second kernel output)
+ if (_need_im2col)
+ {
+ im2col_data = im2col->data<int8_t>();
+ }
+
+ luci_interpreter_pal::ConvPerChannel(
+ params, multipliers.data(), shifts.data(), getTensorShape(input()),
+ getTensorData<int8_t>(input()), getTensorShape(filter()), getTensorData<int8_t>(filter()),
+ getTensorShape(bias()), getTensorData<int32_t>(bias()), getTensorShape(output()),
+ getTensorData<int8_t>(output()), getTensorShape(im2col), im2col_data);
+}
+
void Conv2D::evalQuantizedS16() const
{
const auto *input_data = getTensorData<int16_t>(input());