}
};
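+// Specialization for input_depth == 1, depth_multiplier == 20: each scalar
+// input value is multiplied by 20 filter values, accumulated into five
+// float32x4 registers per output pixel.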
+template <>
+struct FloatDepthwiseConvKernel<true, 1, 20> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const float* input_ptr, int input_ptr_increment,
+ const float* filter_ptr, float* acc_buffer_ptr) {
+ // Load the filters
+ float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0);
+ float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1);
+ float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2);
+ float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3);
+ float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4);
+
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++) {
+ // Load the single input value (input_depth == 1).
+ const float input_val = *input_ptr;
+ input_ptr += input_ptr_increment;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
+ float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
+ float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
+ float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
+ float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4);
+ // Multiply-accumulate
+ acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val);
+ acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val);
+ acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val);
+ acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val);
+ acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val);
+ // Store the accumulators back to acc_buffer
+ vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
+ vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
+ vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
+ vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
+ vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4);
+ acc_buffer_ptr += 20;
+ }
+ }
+};
+
template <>
struct FloatDepthwiseConvKernel<true, 0, 16> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 2)
}
};
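+// Quantized counterpart of the <true, 1, 20> float kernel above: filter and
+// input bytes are widened to int16 (with their offsets added), then
+// multiply-accumulated into five int32x4 registers per output pixel.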
+template <>
+struct QuantizedDepthwiseConvKernel<true, 1, 20> {
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const uint8* input_ptr, int16 input_offset,
+ int input_ptr_increment, const uint8* filter_ptr,
+ int16 filter_offset, int32* acc_buffer_ptr) {
+ // Load the filters, add filter_offset.
+ // NEON loads 8 bytes at a time with vld1_u8, but 20 is not a multiple
+ // of 8. We load the first 16 bytes into filter_u8_{0,1} as usual, then
+ // load the last 8 bytes (offsets 12..19) into filter_u8_x (x for
+ // 'extra'). This overlapping load is redundant: the first 4 bytes of
+ // filter_u8_x are the same as the last 4 bytes of filter_u8_1.
+ uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
+ uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
+ uint8x8_t filter_u8_x = vld1_u8(filter_ptr + 8 * 1 + 4);
+ int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
+ int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
+ int16x8_t filter_x = vreinterpretq_s16_u16(vmovl_u8(filter_u8_x));
+ filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
+ filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
+ filter_x = vaddq_s16(filter_x, vdupq_n_s16(filter_offset));
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++) {
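+ // Load the input, add input_offset.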
+ uint8 input_u8 = *input_ptr;
+ input_ptr += input_ptr_increment;
+ int16 input = static_cast<int16>(input_u8 + input_offset);
+ // Load the accumulators from acc_buffer
+ int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+ int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+ int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+ int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
+ int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
+ // Multiply-accumulate
+ acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
+ acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
+ acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
+ acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
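+ // filter_x covers channels 12..19, so its high half supplies the last
+ // four channels (16..19) for acc_4.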
+ acc_4 = vmlal_n_s16(acc_4, vget_high_s16(filter_x), input);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+ vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+ vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+ vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
+ vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
+ acc_buffer_ptr += 20;
+ }
+ }
+};
+
template <>
struct QuantizedDepthwiseConvKernel<true, 1, 8> {
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2)
TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1)
TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)