#ifdef __aarch64__
// Hints the CPU to bring the cache line containing |ptr| into the L1 data
// cache and keep it there ("keep" temporal policy). Pure prefetch hint: no
// data is read or written.
inline void preload_l1_keep(const uint8* ptr) {
#ifdef GEMMLOWP_ARM_64
  // Emit the A64 PRFM PLDL1KEEP instruction directly.
  asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
#else
  // Fall back to gemmlowp's generic prefetch helper.
  gemmlowp::Prefetch(ptr);
#endif
}
-
// Implementation of quantized DepthwiseConv for 3x3 filters.

// Below are helper structs to remove the use of arrays.
// There is an llvm bug that causes significant slowdown when using arrays for
// NEON intrinsics vector data types.
// See: https://bugs.llvm.org/show_bug.cgi?id=34945
-
// Eight lanes of 32-bit accumulators held as two int32x4_t registers.
// Scalar fields are used instead of int32x4_t[2] to avoid the llvm
// array-of-vector slowdown (llvm.org PR34945).
struct Int32x8 {
  int32x4_t low, high;
};
-
// A 3x3 filter of depth 8, widened to int16: one int16x8_t register per
// filter tap (f0..f8, row-major). Scalar fields rather than an array for
// the same llvm reason as above (llvm.org PR34945).
struct Filter3x3x8 {
  int16x8_t f0, f1, f2, f3, f4, f5, f6, f7, f8;
};
-
// Loads a 3x3 filter of depth 8 and adds |filter_offset| to every lane.
// Consecutive filter taps are |output_depth| elements apart in memory.
inline Filter3x3x8 Load3x3Filter(const uint8* filter_ptr, int32 filter_offset,
                                 int output_depth) {
  const int16x8_t offset_vec = vdupq_n_s16(filter_offset);

  // For each tap: load 8 uint8 lanes, widen to int16, apply the offset.
  // Fully unrolled into scalar struct fields (no arrays; see llvm note
  // above).
  Filter3x3x8 result;
  result.f0 = vaddq_s16(
      vreinterpretq_s16_u16(vmovl_u8(vld1_u8(filter_ptr + 0 * output_depth))),
      offset_vec);
  result.f1 = vaddq_s16(
      vreinterpretq_s16_u16(vmovl_u8(vld1_u8(filter_ptr + 1 * output_depth))),
      offset_vec);
  result.f2 = vaddq_s16(
      vreinterpretq_s16_u16(vmovl_u8(vld1_u8(filter_ptr + 2 * output_depth))),
      offset_vec);
  result.f3 = vaddq_s16(
      vreinterpretq_s16_u16(vmovl_u8(vld1_u8(filter_ptr + 3 * output_depth))),
      offset_vec);
  result.f4 = vaddq_s16(
      vreinterpretq_s16_u16(vmovl_u8(vld1_u8(filter_ptr + 4 * output_depth))),
      offset_vec);
  result.f5 = vaddq_s16(
      vreinterpretq_s16_u16(vmovl_u8(vld1_u8(filter_ptr + 5 * output_depth))),
      offset_vec);
  result.f6 = vaddq_s16(
      vreinterpretq_s16_u16(vmovl_u8(vld1_u8(filter_ptr + 6 * output_depth))),
      offset_vec);
  result.f7 = vaddq_s16(
      vreinterpretq_s16_u16(vmovl_u8(vld1_u8(filter_ptr + 7 * output_depth))),
      offset_vec);
  result.f8 = vaddq_s16(
      vreinterpretq_s16_u16(vmovl_u8(vld1_u8(filter_ptr + 8 * output_depth))),
      offset_vec);
  return result;
}
-
// Applies the output requantization pipeline (fixed-point multiply, rounding
// shift, output offset, activation clamp) to four accumulator sets that form
// a 2x2 output patch of depth 8, then saturates to uint8 and stores them.
inline void DownquantizeAndStore2x2Output(
    Int32x8 acc_0, Int32x8 acc_1, Int32x8 acc_2, Int32x8 acc_3,
    int32 output_offset, int32 output_multiplier, int output_shift,
    int32 output_activation_min, int32 output_activation_max, uint8* output_ptr,
    int output_depth, int output_width) {
  const int32x4_t offset_vec = vdupq_n_s32(output_offset);
  const int32x4_t act_min_vec = vdupq_n_s32(output_activation_min);
  const int32x4_t act_max_vec = vdupq_n_s32(output_activation_max);

  // Requantizes one set of 8 accumulators: saturating-doubling multiply by
  // the quantized multiplier, rounding right shift, re-center on the output
  // zero point, then clamp to the activation range.
  const auto requantize = [&](Int32x8 acc) {
    acc.low = vqrdmulhq_n_s32(acc.low, output_multiplier);
    acc.high = vqrdmulhq_n_s32(acc.high, output_multiplier);
    acc.low = gemmlowp::RoundingDivideByPOT(acc.low, output_shift);
    acc.high = gemmlowp::RoundingDivideByPOT(acc.high, output_shift);
    acc.low = vaddq_s32(acc.low, offset_vec);
    acc.high = vaddq_s32(acc.high, offset_vec);
    acc.low = vmaxq_s32(acc.low, act_min_vec);
    acc.high = vmaxq_s32(acc.high, act_min_vec);
    acc.low = vminq_s32(acc.low, act_max_vec);
    acc.high = vminq_s32(acc.high, act_max_vec);
    return acc;
  };

  // Saturating narrow: 8 lanes of int32 -> int16 -> uint8.
  const auto narrow_to_u8 = [](Int32x8 acc) {
    const int16x8_t s16 =
        vcombine_s16(vqmovn_s32(acc.low), vqmovn_s32(acc.high));
    return vqmovun_s16(s16);
  };

  acc_0 = requantize(acc_0);
  acc_1 = requantize(acc_1);
  acc_2 = requantize(acc_2);
  acc_3 = requantize(acc_3);

  // Store the 2x2 patch: acc_0/acc_1 are adjacent pixels on the first row,
  // acc_2/acc_3 the corresponding pixels one output row below.
  const int output_row_size = output_depth * output_width;
  vst1_u8(output_ptr, narrow_to_u8(acc_0));
  vst1_u8(output_ptr + output_depth, narrow_to_u8(acc_1));
  vst1_u8(output_ptr + output_row_size, narrow_to_u8(acc_2));
  vst1_u8(output_ptr + output_row_size + output_depth, narrow_to_u8(acc_3));
}
-
// Requantizes one set of 8 accumulators (fixed-point multiply, rounding
// shift, output offset, activation clamp), saturates to uint8 and stores
// the 8 lanes at |output_ptr|.
inline void DownquantizeAndStore(Int32x8 acc, int32 output_offset,
                                 int32 output_multiplier, int output_shift,
                                 int32 output_activation_min,
                                 int32 output_activation_max,
                                 uint8* output_ptr) {
  // Saturating-doubling multiply by the quantized output multiplier.
  int32x4_t lo = vqrdmulhq_n_s32(acc.low, output_multiplier);
  int32x4_t hi = vqrdmulhq_n_s32(acc.high, output_multiplier);

  // Rounding right shift completes the fixed-point rescale.
  lo = gemmlowp::RoundingDivideByPOT(lo, output_shift);
  hi = gemmlowp::RoundingDivideByPOT(hi, output_shift);

  // Re-center on the output zero point.
  const int32x4_t offset_vec = vdupq_n_s32(output_offset);
  lo = vaddq_s32(lo, offset_vec);
  hi = vaddq_s32(hi, offset_vec);

  // Clamp to the fused activation range.
  const int32x4_t act_min_vec = vdupq_n_s32(output_activation_min);
  const int32x4_t act_max_vec = vdupq_n_s32(output_activation_max);
  lo = vmaxq_s32(lo, act_min_vec);
  hi = vmaxq_s32(hi, act_min_vec);
  lo = vminq_s32(lo, act_max_vec);
  hi = vminq_s32(hi, act_max_vec);

  // Saturating narrow int32 -> int16 -> uint8 and store.
  const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(lo), vqmovn_s32(hi));
  vst1_u8(output_ptr, vqmovun_s16(res_s16));
}
-
// Requantizes two accumulator sets (two outputs of depth 8) and stores the
// two resulting uint8x8 vectors |output_ptr_offset| elements apart.
inline void DownquantizeAndStore2Output(
    Int32x8 acc_0, Int32x8 acc_1, int32 output_offset, int32 output_multiplier,
    int output_shift, int32 output_activation_min, int32 output_activation_max,
    uint8* output_ptr, int output_ptr_offset) {
  // NOTE(review): the braces scope the broadcast constants to the
  // requantization phase — presumably to shorten live ranges for register
  // allocation; confirm before restructuring.
  {
    using gemmlowp::RoundingDivideByPOT;
    const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
    const int32x4_t output_activation_min_vec =
        vdupq_n_s32(output_activation_min);
    const int32x4_t output_activation_max_vec =
        vdupq_n_s32(output_activation_max);

    // Fixed-point multiplication.
    acc_0.low = vqrdmulhq_n_s32(acc_0.low, output_multiplier);
    acc_0.high = vqrdmulhq_n_s32(acc_0.high, output_multiplier);
    acc_1.low = vqrdmulhq_n_s32(acc_1.low, output_multiplier);
    acc_1.high = vqrdmulhq_n_s32(acc_1.high, output_multiplier);

    // Rounding right shift completes the fixed-point rescale.
    acc_0.low = RoundingDivideByPOT(acc_0.low, output_shift);
    acc_0.high = RoundingDivideByPOT(acc_0.high, output_shift);
    acc_1.low = RoundingDivideByPOT(acc_1.low, output_shift);
    acc_1.high = RoundingDivideByPOT(acc_1.high, output_shift);

    // Add the output offset.
    acc_0.low = vaddq_s32(acc_0.low, output_offset_vec);
    acc_0.high = vaddq_s32(acc_0.high, output_offset_vec);
    acc_1.low = vaddq_s32(acc_1.low, output_offset_vec);
    acc_1.high = vaddq_s32(acc_1.high, output_offset_vec);

    // Apply the activation function.
    acc_0.low = vmaxq_s32(acc_0.low, output_activation_min_vec);
    acc_0.high = vmaxq_s32(acc_0.high, output_activation_min_vec);
    acc_1.low = vmaxq_s32(acc_1.low, output_activation_min_vec);
    acc_1.high = vmaxq_s32(acc_1.high, output_activation_min_vec);

    acc_0.low = vminq_s32(acc_0.low, output_activation_max_vec);
    acc_0.high = vminq_s32(acc_0.high, output_activation_max_vec);
    acc_1.low = vminq_s32(acc_1.low, output_activation_max_vec);
    acc_1.high = vminq_s32(acc_1.high, output_activation_max_vec);
  }

  // Saturating cast to uint8 and store to destination.
  int16x8_t res_0_s16;
  {
    int16x4_t acc_0_low_s16 = vqmovn_s32(acc_0.low);
    int16x4_t acc_0_high_s16 = vqmovn_s32(acc_0.high);
    res_0_s16 = vcombine_s16(acc_0_low_s16, acc_0_high_s16);
  }

  int16x8_t res_1_s16;
  {
    int16x4_t acc_1_low_s16 = vqmovn_s32(acc_1.low);
    int16x4_t acc_1_high_s16 = vqmovn_s32(acc_1.high);
    res_1_s16 = vcombine_s16(acc_1_low_s16, acc_1_high_s16);
  }

  uint8x8_t res_0_u8 = vqmovun_s16(res_0_s16);
  uint8x8_t res_1_u8 = vqmovun_s16(res_1_s16);
  vst1_u8(output_ptr, res_0_u8);
  vst1_u8(output_ptr + output_ptr_offset, res_1_u8);
}
-
// Multiply-accumulates one filter row (f0..f2) against one input row
// (i0..i2) across 8 depth lanes, widening int16*int16 products into the
// int32 accumulators.
inline Int32x8 MultiplyAccumulateRow(Int32x8 accum, int16x8_t f0, int16x8_t f1,
                                     int16x8_t f2, int16x8_t i0, int16x8_t i1,
                                     int16x8_t i2) {
  // Lower 4 depth lanes.
  int32x4_t lo = accum.low;
  lo = vmlal_s16(lo, vget_low_s16(f0), vget_low_s16(i0));
  lo = vmlal_s16(lo, vget_low_s16(f1), vget_low_s16(i1));
  lo = vmlal_s16(lo, vget_low_s16(f2), vget_low_s16(i2));
  // Upper 4 depth lanes.
  int32x4_t hi = accum.high;
  hi = vmlal_s16(hi, vget_high_s16(f0), vget_high_s16(i0));
  hi = vmlal_s16(hi, vget_high_s16(f1), vget_high_s16(i1));
  hi = vmlal_s16(hi, vget_high_s16(f2), vget_high_s16(i2));
  accum.low = lo;
  accum.high = hi;
  return accum;
}
-
// Multiply-accumulates the full 3x3 filter against a 3x3 input window of
// depth 8. Input registers i0..i8 pair with filter taps f0..f8 (row-major).
inline Int32x8 MultiplyAccumulate3x3Filter(const Filter3x3x8& f, int16x8_t i0,
                                           int16x8_t i1, int16x8_t i2,
                                           int16x8_t i3, int16x8_t i4,
                                           int16x8_t i5, int16x8_t i6,
                                           int16x8_t i7, int16x8_t i8,
                                           Int32x8 accum) {
  // Lower 4 depth lanes across all nine taps.
  int32x4_t lo = accum.low;
  lo = vmlal_s16(lo, vget_low_s16(f.f0), vget_low_s16(i0));
  lo = vmlal_s16(lo, vget_low_s16(f.f1), vget_low_s16(i1));
  lo = vmlal_s16(lo, vget_low_s16(f.f2), vget_low_s16(i2));
  lo = vmlal_s16(lo, vget_low_s16(f.f3), vget_low_s16(i3));
  lo = vmlal_s16(lo, vget_low_s16(f.f4), vget_low_s16(i4));
  lo = vmlal_s16(lo, vget_low_s16(f.f5), vget_low_s16(i5));
  lo = vmlal_s16(lo, vget_low_s16(f.f6), vget_low_s16(i6));
  lo = vmlal_s16(lo, vget_low_s16(f.f7), vget_low_s16(i7));
  lo = vmlal_s16(lo, vget_low_s16(f.f8), vget_low_s16(i8));
  // Upper 4 depth lanes across all nine taps.
  int32x4_t hi = accum.high;
  hi = vmlal_s16(hi, vget_high_s16(f.f0), vget_high_s16(i0));
  hi = vmlal_s16(hi, vget_high_s16(f.f1), vget_high_s16(i1));
  hi = vmlal_s16(hi, vget_high_s16(f.f2), vget_high_s16(i2));
  hi = vmlal_s16(hi, vget_high_s16(f.f3), vget_high_s16(i3));
  hi = vmlal_s16(hi, vget_high_s16(f.f4), vget_high_s16(i4));
  hi = vmlal_s16(hi, vget_high_s16(f.f5), vget_high_s16(i5));
  hi = vmlal_s16(hi, vget_high_s16(f.f6), vget_high_s16(i6));
  hi = vmlal_s16(hi, vget_high_s16(f.f7), vget_high_s16(i7));
  hi = vmlal_s16(hi, vget_high_s16(f.f8), vget_high_s16(i8));
  accum.low = lo;
  accum.high = hi;
  return accum;
}
-
// Computes one output of depth 8: bias + 3x3 dot product, then
// requantizes and stores it at |output_ptr|.
inline void DotProductAndStore(const Filter3x3x8& filter, int16x8_t i0,
                               int16x8_t i1, int16x8_t i2, int16x8_t i3,
                               int16x8_t i4, int16x8_t i5, int16x8_t i6,
                               int16x8_t i7, int16x8_t i8,
                               const int32* bias_ptr, int32 output_offset,
                               int32 output_multiplier, int output_shift,
                               int32 output_activation_min,
                               int32 output_activation_max, uint8* output_ptr) {
  // Seed the accumulator with the per-channel bias.
  Int32x8 acc = {vld1q_s32(bias_ptr), vld1q_s32(bias_ptr + 4)};

  acc = MultiplyAccumulate3x3Filter(filter, i0, i1, i2, i3, i4, i5, i6, i7, i8,
                                    acc);

  DownquantizeAndStore(acc, output_offset, output_multiplier, output_shift,
                       output_activation_min, output_activation_max,
                       output_ptr);
}
-
// Computes two horizontally-adjacent outputs (stride 1) from a 3x4 input
// window and stores the requantized results |output_ptr_offset| apart.
// Rows of the window are {i0..i3}, {i4..i7}, {i8..i11}.
inline void DotProductAndStore2xStride1(
    const Filter3x3x8& filter, int16x8_t i0, int16x8_t i1, int16x8_t i2,
    int16x8_t i3, int16x8_t i4, int16x8_t i5, int16x8_t i6, int16x8_t i7,
    int16x8_t i8, int16x8_t i9, int16x8_t i10, int16x8_t i11,
    const int32* bias_ptr, int32 output_offset, int32 output_multiplier,
    int output_shift, int32 output_activation_min, int32 output_activation_max,
    uint8* output_ptr, int output_ptr_offset) {
  // Both accumulators start from the per-channel bias.
  const int32x4_t bias_low = vld1q_s32(bias_ptr);
  const int32x4_t bias_high = vld1q_s32(bias_ptr + 4);
  Int32x8 acc_0 = {bias_low, bias_high};
  Int32x8 acc_1 = {bias_low, bias_high};

  // Left output reads columns 0..2 of each row; right output reads
  // columns 1..3.
  acc_0 = MultiplyAccumulate3x3Filter(filter, i0, i1, i2, i4, i5, i6, i8, i9,
                                      i10, acc_0);
  acc_1 = MultiplyAccumulate3x3Filter(filter, i1, i2, i3, i5, i6, i7, i9, i10,
                                      i11, acc_1);

  DownquantizeAndStore2Output(acc_0, acc_1, output_offset, output_multiplier,
                              output_shift, output_activation_min,
                              output_activation_max, output_ptr,
                              output_ptr_offset);
}
-
// Computes two vertically-adjacent outputs (stride 1) from a 4x3 input
// window and stores the requantized results |output_ptr_offset| apart.
// Rows of the window are {i0..i2}, {i3..i5}, {i6..i8}, {i9..i11}.
inline void DotProductAndStore2yStride1(
    const Filter3x3x8& filter, int16x8_t i0, int16x8_t i1, int16x8_t i2,
    int16x8_t i3, int16x8_t i4, int16x8_t i5, int16x8_t i6, int16x8_t i7,
    int16x8_t i8, int16x8_t i9, int16x8_t i10, int16x8_t i11,
    const int32* bias_ptr, int32 output_offset, int32 output_multiplier,
    int output_shift, int32 output_activation_min, int32 output_activation_max,
    uint8* output_ptr, int output_ptr_offset) {
  // Both accumulators start from the per-channel bias.
  const int32x4_t bias_low = vld1q_s32(bias_ptr);
  const int32x4_t bias_high = vld1q_s32(bias_ptr + 4);
  Int32x8 acc_0 = {bias_low, bias_high};
  Int32x8 acc_1 = {bias_low, bias_high};

  // Top output reads rows 0..2; bottom output reads rows 1..3.
  acc_0 = MultiplyAccumulate3x3Filter(filter, i0, i1, i2, i3, i4, i5, i6, i7,
                                      i8, acc_0);
  acc_1 = MultiplyAccumulate3x3Filter(filter, i3, i4, i5, i6, i7, i8, i9, i10,
                                      i11, acc_1);

  DownquantizeAndStore2Output(acc_0, acc_1, output_offset, output_multiplier,
                              output_shift, output_activation_min,
                              output_activation_max, output_ptr,
                              output_ptr_offset);
}
-
// A kernel that is optimized on the number of output cells in the x and y
// direction, and the stride. Assumes 3x3 filters of 8 depth.
// The primary template is intentionally empty: only explicit
// specializations provide a Run() implementation, so instantiating an
// unsupported (output-y, output-x, stride) combination fails at compile
// time.
template <int kFixedOutputY, int kFixedOutputX, int kFixedStrideWidth,
          int kFixedStrideHeight>
struct ConvKernel3x3FilterDepth8 {};
-
-template <>
-struct ConvKernel3x3FilterDepth8<8, 8, 1, 1> {
- static inline void Run(const uint8* input_ptr, int input_depth,
- int32 input_offset, int input_row_size,
- const uint8* filter_ptr, int32 filter_offset,
- const int32* bias_ptr, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_ptr,
- int output_depth, int output_width) {
- Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
-
- const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
- const int output_row_size = output_depth * output_width;
-
- // To process 8x8 outputs using a 3x3 filter, we require 10x10 inputs.
- // Load inputs for the first 2 filters on the top left, then slide to
- // the right, down, left, down, right, etc. in a snake-like path. This
- // minimizes the total number of loads.
- //
- // INPUT OUTPUT
- // |\----------------\ |\------------\
- // | \ \ | \ \
- // | \----------------\ | \------------\
- // | | 0 ... 9 | | | 0 ... 7 |
- // | | 10 ... 19 | ---> | | 8 ... 15 |
- // | | 20 ... 29 | \ | .. ... .. |
- // \ | .. ... .. | \| 56 ... 63 |
- // \| 90 ... 109 | |------------|
- // |----------------|
- //
- // The first set of loads corresponds to:
- //
- // INPUT OUTPUT
- // |\----------------- |\-----------
- // | \ | \
- // | \----------------- | \----------
- // | | 0 1 2 3 ... | | 0 1 ...
- // | | 10 11 12 13 ... ---> | | .. ...
- // | | 20 21 22 23 ... | .. ...
- // | | .. ... ...
- //
- // The next set of loads correspond to a sliding window to the right.
- // It loads inputs 4, 5, 14, 15, 23, 24 and keeps 2, 3, 12, 13, and 22:
- //
- // INPUT OUTPUT
- // |\------------------- |\-------------
- // | \ | \
- // | \------------------- | \------------
- // | | .. 2 3 4 5 ... | | .. 2 3 ...
- // | | .. 12 13 14 15 ... ---> | | .. ...
- // | | .. 21 22 23 24 ... | .. ...
- // | | .. ... ...
- //
- // And so on...
-
- int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, input_9, input_10, input_11;
-
- // Load inputs for 1x2 outputs starting from the top left. Referring to the
- // indexes in the diagram above, this corresponds to outputs (0) and (1).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3;
-
- const uint8* ptr = input_ptr;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
-
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
-
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- input_10 = vaddq_s16(input_10, input_offset_vec);
- input_11 = vaddq_s16(input_11, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr, output_depth);
-
- // Slide to the right for outputs x = [2, 3], y = 0. Referring to the
- // indexes in the diagram above, this corresponds to outputs (2) and (3).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 4 * input_depth;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
- input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr + 2 * output_depth, output_depth);
-
- // Slide to the right again for outputs x = [4, 5], y = 0. Referring to the
- // indexes in the diagram above, this corresponds to outputs (4) and (5).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 6 * input_depth;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- input_10 = vaddq_s16(input_10, input_offset_vec);
- input_11 = vaddq_s16(input_11, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr + 4 * output_depth, output_depth);
-
- // Slide to the right one last time for outputs x = [6, 7], y = 0.
- // Referring to the indexes in the diagram above, this corresponds to
- // outputs (6) and (7).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 8 * input_depth;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
- input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr + 6 * output_depth, output_depth);
-
- // Slide to down for outputs x = [6, 7], y = 1. Referring to the indexes in
- // the diagram above, this corresponds to outputs (14) and (15).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3;
-
- const uint8* ptr = input_ptr + 6 * input_depth + 3 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
- input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr + 6 * output_depth + output_row_size,
- output_depth);
-
- // Slide left for outputs x = [4, 5], y = 1. Referring to the indexes in
- // the diagram above, this corresponds to outputs (12) and (13).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 4 * input_depth + input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
- input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr + 4 * output_depth + output_row_size,
- output_depth);
-
- // Slide left again for outputs x = [2, 3], y = 1. Referring to the indexes
- // in the diagram above, this corresponds to outputs (10) and (11).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 2 * input_depth + input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- input_10 = vaddq_s16(input_10, input_offset_vec);
- input_11 = vaddq_s16(input_11, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
- input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr + 2 * output_depth + output_row_size,
- output_depth);
-
- // Slide left one more time for outputs x = [0, 1], y = 1. Referring to the
- // indexes in the diagram above, this corresponds to outputs (8) and (9).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
- input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr + output_row_size, output_depth);
-
- // Slide down for outputs x = [0, 1], y = 2. Referring to the
- // indexes in the diagram above, this corresponds to outputs (16) and (17).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3;
-
- const uint8* ptr = input_ptr + 4 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
- input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr + 2 * output_row_size, output_depth);
-
- // Slide right for outputs x = [2, 3], y = 2. Referring to the
- // indexes in the diagram above, this corresponds to outputs (18) and (19).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 4 * input_depth + 2 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0,
- input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max,
- output_ptr + 2 * output_depth + 2 * output_row_size, output_depth);
-
- // Slide right for outputs x = [4, 5], y = 2. Referring to the
- // indexes in the diagram above, this corresponds to outputs (20) and (21).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 6 * input_depth + 2 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_10 = vaddq_s16(input_10, input_offset_vec);
- input_11 = vaddq_s16(input_11, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
- input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max,
- output_ptr + 4 * output_depth + 2 * output_row_size, output_depth);
-
- // Slide right one more time for outputs x = [6, 7], y = 2. Referring to the
- // indexes in the diagram above, this corresponds to outputs (22) and (23).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 8 * input_depth + 2 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0,
- input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max,
- output_ptr + 6 * output_depth + 2 * output_row_size, output_depth);
-
- // Slide down for outputs x = [6, 7], y = 3. Referring to the indexes in
- // the diagram above, this corresponds to outputs (30) and (31).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3;
-
- const uint8* ptr = input_ptr + 6 * input_depth + 5 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_10 = vaddq_s16(input_10, input_offset_vec);
- input_11 = vaddq_s16(input_11, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
- input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max,
- output_ptr + 6 * output_depth + 3 * output_row_size, output_depth);
-
- // Slide left for outputs x = [4, 5], y = 3. Referring to the indexes in
- // the diagram above, this corresponds to outputs (28) and (29).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 4 * input_depth + 3 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max,
- output_ptr + 4 * output_depth + 3 * output_row_size, output_depth);
-
- // Slide left for outputs x = [2, 3], y = 3. Referring to the indexes in
- // the diagram above, this corresponds to outputs (26) and (27).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 2 * input_depth + 3 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- input_10 = vaddq_s16(input_10, input_offset_vec);
- input_11 = vaddq_s16(input_11, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
- input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max,
- output_ptr + 2 * output_depth + 3 * output_row_size, output_depth);
-
- // Slide left one more time for outputs x = [0, 1], y = 3. Referring to the
- // indexes in the diagram above, this corresponds to outputs (24) and (25).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 3 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr + 3 * output_row_size, output_depth);
-
- // Slide down for outputs x = [0, 1], y = 4. Referring to the indexes in
- // the diagram above, this corresponds to outputs (32) and (33).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3;
-
- const uint8* ptr = input_ptr + 6 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
- input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr + 4 * output_row_size, output_depth);
-
- // Slide right for outputs x = [2, 3], y = 4. Referring to the indexes in
- // the diagram above, this corresponds to outputs (34) and (35).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 4 * input_depth + 4 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- }
+// clang-format gets confused with this file and ends up formatting lines to
+// be larger than 80 characters. Turn off here and back on at the end of the
+// file.
- DotProductAndStore2xStride1(
- filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
- input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max,
- output_ptr + 2 * output_depth + 4 * output_row_size, output_depth);
-
- // Slide right for outputs x = [4, 5], y = 4. Referring to the indexes in
- // the diagram above, this corresponds to outputs (36) and (37).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 6 * input_depth + 4 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- input_10 = vaddq_s16(input_10, input_offset_vec);
- input_11 = vaddq_s16(input_11, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
- input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max,
- output_ptr + 4 * output_depth + 4 * output_row_size, output_depth);
-
- // Slide right one more time for outputs x = [6, 7], y = 4. Referring to the
- // indexes in the diagram above, this corresponds to outputs (38) and (39).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 8 * input_depth + 4 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
- input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max,
- output_ptr + 6 * output_depth + 4 * output_row_size, output_depth);
-
- // Slide down for outputs x = [6, 7], y = 5. Referring to the indexes in
- // the diagram above, this corresponds to outputs (46) and (47).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3;
-
- const uint8* ptr = input_ptr + 6 * input_depth + 7 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0,
- input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max,
- output_ptr + 6 * output_depth + 5 * output_row_size, output_depth);
-
- // Slide left for outputs x = [4, 5], y = 5. Referring to the indexes in
- // the diagram above, this corresponds to outputs (44) and (45).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 4 * input_depth + 5 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
- input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max,
- output_ptr + 4 * output_depth + 5 * output_row_size, output_depth);
-
- // Slide left for outputs x = [2, 3], y = 5. Referring to the indexes in
- // the diagram above, this corresponds to outputs (42) and (43).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 2 * input_depth + 5 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_10 = vaddq_s16(input_10, input_offset_vec);
- input_11 = vaddq_s16(input_11, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0,
- input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max,
- output_ptr + 2 * output_depth + 5 * output_row_size, output_depth);
-
- // Slide left one more time for outputs x = [0, 1], y = 5. Referring to the
- // indexes in the diagram above, this corresponds to outputs (40) and (41).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 5 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
- input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr + 5 * output_row_size, output_depth);
-
- // Slide down for outputs x = [0, 1], y = 6. Referring to the indexes in
- // the diagram above, this corresponds to outputs (48) and (49).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3;
-
- const uint8* ptr = input_ptr + 8 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- input_10 = vaddq_s16(input_10, input_offset_vec);
- input_11 = vaddq_s16(input_11, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr + 6 * output_row_size, output_depth);
-
- // Slide right for outputs x = [2, 3], y = 6. Referring to the indexes in
- // the diagram above, this corresponds to outputs (50) and (51).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 4 * input_depth + 6 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
- input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max,
- output_ptr + 2 * output_depth + 6 * output_row_size, output_depth);
-
- // Slide right for outputs x = [4, 5], y = 6. Referring to the indexes in
- // the diagram above, this corresponds to outputs (52) and (53).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 6 * input_depth + 6 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- input_10 = vaddq_s16(input_10, input_offset_vec);
- input_11 = vaddq_s16(input_11, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max,
- output_ptr + 4 * output_depth + 6 * output_row_size, output_depth);
-
- // Slide right one more time for outputs x = [6, 7], y = 6. Referring to the
- // indexes in the diagram above, this corresponds to outputs (54) and (55).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 8 * input_depth + 6 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
- input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max,
- output_ptr + 6 * output_depth + 6 * output_row_size, output_depth);
-
- // Slide down for outputs x = [6, 7], y = 7. Referring to the indexes in the
- // diagram above, this corresponds to outputs (62) and (63).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3;
-
- const uint8* ptr = input_ptr + 6 * input_depth + 9 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
- input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max,
- output_ptr + 6 * output_depth + 7 * output_row_size, output_depth);
-
- // Slide left for outputs x = [4, 5], y = 7. Referring to the indexes in the
- // diagram above, this corresponds to outputs (60) and (61).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 4 * input_depth + 7 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
- input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max,
- output_ptr + 4 * output_depth + 7 * output_row_size, output_depth);
-
- // Slide left for outputs x = [2, 3], y = 7. Referring to the indexes in the
- // diagram above, this corresponds to outputs (58) and (59).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 2 * input_depth + 7 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- input_10 = vaddq_s16(input_10, input_offset_vec);
- input_11 = vaddq_s16(input_11, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
- input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max,
- output_ptr + 2 * output_depth + 7 * output_row_size, output_depth);
-
- // Slide left one more time for outputs x = [0, 1], y = 7. Referring to the
- // indexes in the diagram above, this corresponds to outputs (56) and (57).
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 7 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
- input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr + 7 * output_row_size, output_depth);
- }
-};
-
-template <>
-struct ConvKernel3x3FilterDepth8<4, 4, 1, 1> {
- static inline void Run(const uint8* input_ptr, int input_depth,
- int32 input_offset, int input_row_size,
- const uint8* filter_ptr, int32 filter_offset,
- const int32* bias_ptr, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_ptr,
- int output_depth, int output_width) {
- Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
-
- const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
- const int output_row_size = output_depth * output_width;
-
- // To process 4x4 outputs using a 3x3 filter, we require 6x6 inputs.
- // Load inputs for the first 2 filters on the top left, then slide to
- // the right, down, left, down, right, etc. in a snake-like path. This
- // minimizes the total number of loads.
- int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, input_9, input_10, input_11;
-
- // Load inputs for 1x2 outputs starting from the top left.
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3;
-
- const uint8* ptr = input_ptr;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
-
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
-
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- input_10 = vaddq_s16(input_10, input_offset_vec);
- input_11 = vaddq_s16(input_11, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr, output_depth);
-
- // Now load 1x2 inputs on the top right.
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 4 * input_depth;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
- input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr + 2 * output_depth, output_depth);
-
- // Now load next inputs when sliding window down.
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3;
-
- const uint8* ptr = input_ptr + 2 * input_depth + 3 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
- input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr + 2 * output_depth + output_row_size,
- output_depth);
-
- // Now load next inputs when sliding window left.
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
- input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr + output_row_size, output_depth);
-
- // Now load next inputs when sliding window down.
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3;
-
- const uint8* ptr = input_ptr + 4 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- }
+// clang-format off
- DotProductAndStore2xStride1(
- filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
- input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr + 2 * output_row_size, output_depth);
-
- // Now load next inputs when sliding window right.
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 4 * input_depth + 2 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0,
- input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max,
- output_ptr + 2 * output_depth + 2 * output_row_size, output_depth);
-
- // Now load next inputs when sliding window down.
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3;
-
- const uint8* ptr = input_ptr + 2 * input_depth + 5 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_10 = vaddq_s16(input_10, input_offset_vec);
- input_11 = vaddq_s16(input_11, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
- input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max,
- output_ptr + 2 * output_depth + 3 * output_row_size, output_depth);
-
- // Now load next inputs when sliding window left.
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 3 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr + 3 * output_row_size, output_depth);
- }
-};
-
-template <>
-struct ConvKernel3x3FilterDepth8<4, 2, 1, 1> {
- static inline void Run(const uint8* input_ptr, int input_depth,
- int32 input_offset, int input_row_size,
- const uint8* filter_ptr, int32 filter_offset,
- const int32* bias_ptr, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_ptr,
- int output_depth, int output_width) {
- Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
-
- const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
- const int output_row_size = output_depth * output_width;
-
- int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, input_9, input_10, input_11;
-
- // Load inputs for 1x2 outputs starting from the top.
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3;
-
- const uint8* ptr = input_ptr;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
-
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
-
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- input_10 = vaddq_s16(input_10, input_offset_vec);
- input_11 = vaddq_s16(input_11, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr, output_depth);
-
- output_ptr += output_row_size;
-
- // Now load next inputs one row down.
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3;
-
- const uint8* ptr = input_ptr + 3 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
- input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr, output_depth);
-
- output_ptr += output_row_size;
-
- // Now load next row.
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3;
-
- const uint8* ptr = input_ptr + 4 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
- input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr, output_depth);
-
- output_ptr += output_row_size;
-
- // Now load last row.
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3;
-
- const uint8* ptr = input_ptr + 5 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- input_10 = vaddq_s16(input_10, input_offset_vec);
- input_11 = vaddq_s16(input_11, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr, output_depth);
- }
-};
-
-template <>
-struct ConvKernel3x3FilterDepth8<4, 1, 1, 1> {
- static inline void Run(const uint8* input_ptr, int input_depth,
- int32 input_offset, int input_row_size,
- const uint8* filter_ptr, int32 filter_offset,
- const int32* bias_ptr, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_ptr,
- int output_depth, int output_width) {
- Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
-
- const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
- const int output_row_size = output_depth * output_width;
-
- int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, input_9, input_10, input_11;
-
- // Load inputs for 2x1 outputs starting from the top.
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- ptr += input_row_size;
- temp_3 = vld1_u8(ptr);
- temp_4 = vld1_u8(ptr + input_depth);
- temp_5 = vld1_u8(ptr + 2 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
-
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- ptr += input_row_size;
- temp_3 = vld1_u8(ptr);
- temp_4 = vld1_u8(ptr + input_depth);
- temp_5 = vld1_u8(ptr + 2 * input_depth);
-
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- input_10 = vaddq_s16(input_10, input_offset_vec);
- input_11 = vaddq_s16(input_11, input_offset_vec);
- }
-
- DotProductAndStore2yStride1(
- filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr, output_row_size);
-
- // Load inputs for bottom 2 rows.
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 4 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- ptr += input_row_size;
- temp_3 = vld1_u8(ptr);
- temp_4 = vld1_u8(ptr + input_depth);
- temp_5 = vld1_u8(ptr + 2 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- }
-
- DotProductAndStore2yStride1(
- filter, input_6, input_7, input_8, input_9, input_10, input_11, input_0,
- input_1, input_2, input_3, input_4, input_5, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr + 2 * output_row_size,
- output_row_size);
- }
-};
-
-template <>
-struct ConvKernel3x3FilterDepth8<2, 2, 1, 1> {
- static inline void Run(const uint8* input_ptr, int input_depth,
- int32 input_offset, int input_row_size,
- const uint8* filter_ptr, int32 filter_offset,
- const int32* bias_ptr, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_ptr,
- int output_depth, int output_width) {
- Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
-
- Int32x8 acc_0, acc_1, acc_2, acc_3;
-
- acc_0.low = vld1q_s32(bias_ptr);
- acc_1.low = vld1q_s32(bias_ptr);
- acc_2.low = vld1q_s32(bias_ptr);
- acc_3.low = vld1q_s32(bias_ptr);
-
- bias_ptr += 4;
- acc_0.high = vld1q_s32(bias_ptr);
- acc_1.high = vld1q_s32(bias_ptr);
- acc_2.high = vld1q_s32(bias_ptr);
- acc_3.high = vld1q_s32(bias_ptr);
-
- const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
-
- // Add scope for input registers to help the compiler know that it is
- // not needed.
- {
- // To process 2x2 outputs using a 3x3 filter, we require 4x4 inputs.
- // Load inputs for the top two filters first.
- int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, input_9, input_10, input_11;
-
- const uint8* ptr = input_ptr;
-
- // Load top 3 rows.
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3;
-
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
-
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
-
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- input_10 = vaddq_s16(input_10, input_offset_vec);
- input_11 = vaddq_s16(input_11, input_offset_vec);
- }
-
- // Multiply-accum for top-left output.
- acc_0 = MultiplyAccumulate3x3Filter(filter, input_0, input_1, input_2,
- input_4, input_5, input_6, input_8,
- input_9, input_10, acc_0);
-
- // Multiply-accum for top-right output.
- acc_1 = MultiplyAccumulate3x3Filter(filter, input_1, input_2, input_3,
- input_5, input_6, input_7, input_9,
- input_10, input_11, acc_1);
-
- // Now load the bottom row.
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3;
-
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- }
-
- // Multiply-accum for bottom-left output.
- acc_2 = MultiplyAccumulate3x3Filter(filter, input_4, input_5, input_6,
- input_8, input_9, input_10, input_0,
- input_1, input_2, acc_2);
-
- // Multiply-accum for bottom-right output.
- acc_3 = MultiplyAccumulate3x3Filter(filter, input_5, input_6, input_7,
- input_9, input_10, input_11, input_1,
- input_2, input_3, acc_3);
- }
-
- DownquantizeAndStore2x2Output(acc_0, acc_1, acc_2, acc_3, output_offset,
- output_multiplier, output_shift,
- output_activation_min, output_activation_max,
- output_ptr, output_depth, output_width);
- }
-};
-
-template <>
-struct ConvKernel3x3FilterDepth8<2, 4, 1, 1> {
- static inline void Run(const uint8* input_ptr, int input_depth,
- int32 input_offset, int input_row_size,
- const uint8* filter_ptr, int32 filter_offset,
- const int32* bias_ptr, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_ptr,
- int output_depth, int output_width) {
- Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
-
- const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
- const int output_row_size = output_depth * output_width;
-
- int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, input_9, input_10, input_11;
-
- // Load inputs for 1x2 outputs starting from the top left.
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3;
-
- const uint8* ptr = input_ptr;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
-
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
-
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- input_10 = vaddq_s16(input_10, input_offset_vec);
- input_11 = vaddq_s16(input_11, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr, output_depth);
-
- // Now load 1x2 inputs on the top right.
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + 4 * input_depth;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
- input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr + 2 * output_depth, output_depth);
-
- // Now load next inputs when sliding window down.
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3;
-
- const uint8* ptr = input_ptr + 2 * input_depth + 3 * input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
- input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr + 2 * output_depth + output_row_size,
- output_depth);
-
- // Now load next inputs when sliding window left.
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
- input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr + output_row_size, output_depth);
- }
-};
-
-template <>
-struct ConvKernel3x3FilterDepth8<1, 4, 1, 1> {
- static inline void Run(const uint8* input_ptr, int input_depth,
- int32 input_offset, int input_row_size,
- const uint8* filter_ptr, int32 filter_offset,
- const int32* bias_ptr, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_ptr,
- int output_depth, int output_width) {
- Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
-
- const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
-
- int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, input_9, input_10, input_11;
-
- // Load inputs for 1x2 outputs starting from the left.
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3;
-
- const uint8* ptr = input_ptr;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
-
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
-
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
-
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
-
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- input_10 = vaddq_s16(input_10, input_offset_vec);
- input_11 = vaddq_s16(input_11, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr, output_depth);
-
- // Now load 1x2 inputs on the right.
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr + input_depth * 4;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_2 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
-
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- }
-
- DotProductAndStore2xStride1(
- filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
- input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr + 2 * output_depth, output_depth);
- }
-};
-
-template <>
-struct ConvKernel3x3FilterDepth8<2, 1, 1, 1> {
- static inline void Run(const uint8* input_ptr, int input_depth,
- int32 input_offset, int input_row_size,
- const uint8* filter_ptr, int32 filter_offset,
- const int32* bias_ptr, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_ptr,
- int output_depth, int output_width) {
- Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
-
- // To process 2x1 outputs using a 3x3 filter, we require 4x3 inputs.
- // Load all inputs at the beginning.
- int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, input_9, input_10, input_11;
-
- // Load inputs for 1x2 outputs starting from the top left.
- {
- const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
-
- const uint8* ptr = input_ptr;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- ptr += input_row_size;
- temp_3 = vld1_u8(ptr);
- temp_4 = vld1_u8(ptr + input_depth);
- temp_5 = vld1_u8(ptr + 2 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
-
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- ptr += input_row_size;
- temp_3 = vld1_u8(ptr);
- temp_4 = vld1_u8(ptr + input_depth);
- temp_5 = vld1_u8(ptr + 2 * input_depth);
-
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- input_10 = vaddq_s16(input_10, input_offset_vec);
- input_11 = vaddq_s16(input_11, input_offset_vec);
- }
-
- DotProductAndStore2yStride1(
- filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr, output_depth * output_width);
- }
-};
-
-template <>
-struct ConvKernel3x3FilterDepth8<4, 2, 2, 2> {
- static inline void Run(const uint8* input_ptr, int input_depth,
- int32 input_offset, int input_row_size,
- const uint8* filter_ptr, int32 filter_offset,
- const int32* bias_ptr, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_ptr,
- int output_depth, int output_width) {
- const int output_row_size = output_depth * output_width;
-
- Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
-
- Int32x8 acc_0, acc_1;
- acc_0.low = vld1q_s32(bias_ptr);
- acc_1.low = vld1q_s32(bias_ptr);
- acc_0.high = vld1q_s32(bias_ptr + 4);
- acc_1.high = vld1q_s32(bias_ptr + 4);
-
- const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
-
- int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, input_9;
-
- const uint8* ptr = input_ptr;
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4;
-
- // Load first 2 rows.
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
- temp_4 = vld1_u8(ptr + 4 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
-
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
- temp_4 = vld1_u8(ptr + 4 * input_depth);
-
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
-
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
-
- acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2,
- input_0, input_1, input_2);
-
- acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2,
- input_2, input_3, input_4);
-
- acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5,
- input_5, input_6, input_7);
-
- acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5,
- input_7, input_8, input_9);
-
- // Load next 2 rows.
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
- temp_4 = vld1_u8(ptr + 4 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
-
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
- temp_4 = vld1_u8(ptr + 4 * input_depth);
-
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
-
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
-
- acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8,
- input_0, input_1, input_2);
-
- acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8,
- input_2, input_3, input_4);
-
- DownquantizeAndStore2Output(
- acc_0, acc_1, output_offset, output_multiplier, output_shift,
- output_activation_min, output_activation_max, output_ptr, output_depth);
-
- output_ptr += output_row_size;
-
- // Moving onto the next row of outputs.
- acc_0.low = vld1q_s32(bias_ptr);
- acc_1.low = vld1q_s32(bias_ptr);
- acc_0.high = vld1q_s32(bias_ptr + 4);
- acc_1.high = vld1q_s32(bias_ptr + 4);
-
- acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2,
- input_0, input_1, input_2);
-
- acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2,
- input_2, input_3, input_4);
-
- acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5,
- input_5, input_6, input_7);
-
- acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5,
- input_7, input_8, input_9);
-
- // Load next 2 rows.
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
- temp_4 = vld1_u8(ptr + 4 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
-
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
- temp_4 = vld1_u8(ptr + 4 * input_depth);
-
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
-
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
-
- acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8,
- input_0, input_1, input_2);
-
- acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8,
- input_2, input_3, input_4);
-
- DownquantizeAndStore2Output(
- acc_0, acc_1, output_offset, output_multiplier, output_shift,
- output_activation_min, output_activation_max, output_ptr, output_depth);
-
- output_ptr += output_row_size;
-
- // Moving onto the next row of outputs.
- acc_0.low = vld1q_s32(bias_ptr);
- acc_1.low = vld1q_s32(bias_ptr);
- acc_0.high = vld1q_s32(bias_ptr + 4);
- acc_1.high = vld1q_s32(bias_ptr + 4);
-
- acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2,
- input_0, input_1, input_2);
-
- acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2,
- input_2, input_3, input_4);
-
- acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5,
- input_5, input_6, input_7);
-
- acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5,
- input_7, input_8, input_9);
-
- // Load next 2 rows.
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
- temp_4 = vld1_u8(ptr + 4 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
-
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
- temp_4 = vld1_u8(ptr + 4 * input_depth);
-
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
-
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
-
- acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8,
- input_0, input_1, input_2);
-
- acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8,
- input_2, input_3, input_4);
-
- DownquantizeAndStore2Output(
- acc_0, acc_1, output_offset, output_multiplier, output_shift,
- output_activation_min, output_activation_max, output_ptr, output_depth);
-
- output_ptr += output_row_size;
-
- // Moving onto the next row of outputs.
- acc_0.low = vld1q_s32(bias_ptr);
- acc_1.low = vld1q_s32(bias_ptr);
- acc_0.high = vld1q_s32(bias_ptr + 4);
- acc_1.high = vld1q_s32(bias_ptr + 4);
-
- acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2,
- input_0, input_1, input_2);
-
- acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2,
- input_2, input_3, input_4);
-
- acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5,
- input_5, input_6, input_7);
-
- acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5,
- input_7, input_8, input_9);
-
- // Load last row.
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
- temp_4 = vld1_u8(ptr + 4 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
-
- acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8,
- input_0, input_1, input_2);
-
- acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8,
- input_2, input_3, input_4);
-
- DownquantizeAndStore2Output(
- acc_0, acc_1, output_offset, output_multiplier, output_shift,
- output_activation_min, output_activation_max, output_ptr, output_depth);
- }
-};
-
-template <>
-struct ConvKernel3x3FilterDepth8<4, 4, 2, 2> {
- static inline void Run(const uint8* input_ptr, int input_depth,
- int32 input_offset, int input_row_size,
- const uint8* filter_ptr, int32 filter_offset,
- const int32* bias_ptr, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_ptr,
- int output_depth, int output_width) {
- // Reuse 4x2 kernel twice.
- ConvKernel3x3FilterDepth8<4, 2, 2, 2>::Run(
- input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
- filter_offset, bias_ptr, output_offset, output_multiplier, output_shift,
- output_activation_min, output_activation_max, output_ptr, output_depth,
- output_width);
-
- ConvKernel3x3FilterDepth8<4, 2, 2, 2>::Run(
- input_ptr + 4 * input_depth, input_depth, input_offset, input_row_size,
- filter_ptr, filter_offset, bias_ptr, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max,
- output_ptr + 2 * output_depth, output_depth, output_width);
- }
-};
-
-template <>
-struct ConvKernel3x3FilterDepth8<4, 1, 2, 2> {
- static inline void Run(const uint8* input_ptr, int input_depth,
- int32 input_offset, int input_row_size,
- const uint8* filter_ptr, int32 filter_offset,
- const int32* bias_ptr, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_ptr,
- int output_depth, int output_width) {
- const int output_row_size = output_depth * output_width;
-
- Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
-
- const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
- int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8;
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7,
- temp_8;
-
- const uint8* ptr = input_ptr;
-
- // Load all inputs for top output.
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- ptr += input_row_size;
- temp_3 = vld1_u8(ptr);
- temp_4 = vld1_u8(ptr + input_depth);
- temp_5 = vld1_u8(ptr + 2 * input_depth);
- ptr += input_row_size;
- temp_6 = vld1_u8(ptr);
- temp_7 = vld1_u8(ptr + input_depth);
- temp_8 = vld1_u8(ptr + 2 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
-
- DotProductAndStore(
- filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, bias_ptr, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max, output_ptr);
-
- // Second output.
- output_ptr += output_row_size;
-
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- ptr += input_row_size;
- temp_3 = vld1_u8(ptr);
- temp_4 = vld1_u8(ptr + input_depth);
- temp_5 = vld1_u8(ptr + 2 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
-
- DotProductAndStore(
- filter, input_6, input_7, input_8, input_0, input_1, input_2, input_3,
- input_4, input_5, bias_ptr, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max, output_ptr);
-
- // Third output.
- output_ptr += output_row_size;
-
- ptr += input_row_size;
- temp_6 = vld1_u8(ptr);
- temp_7 = vld1_u8(ptr + input_depth);
- temp_8 = vld1_u8(ptr + 2 * input_depth);
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
-
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
-
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
-
- DotProductAndStore(
- filter, input_3, input_4, input_5, input_6, input_7, input_8, input_0,
- input_1, input_2, bias_ptr, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max, output_ptr);
-
- // Fourth output.
- output_ptr += output_row_size;
-
- ptr += input_row_size;
- temp_3 = vld1_u8(ptr);
- temp_4 = vld1_u8(ptr + input_depth);
- temp_5 = vld1_u8(ptr + 2 * input_depth);
- ptr += input_row_size;
- temp_6 = vld1_u8(ptr);
- temp_7 = vld1_u8(ptr + input_depth);
- temp_8 = vld1_u8(ptr + 2 * input_depth);
-
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
-
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
-
- DotProductAndStore(
- filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, bias_ptr, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max, output_ptr);
- }
-};
-
-template <>
-struct ConvKernel3x3FilterDepth8<2, 2, 2, 2> {
- static inline void Run(const uint8* input_ptr, int input_depth,
- int32 input_offset, int input_row_size,
- const uint8* filter_ptr, int32 filter_offset,
- const int32* bias_ptr, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_ptr,
- int output_depth, int output_width) {
- Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
-
- Int32x8 acc_0, acc_1, acc_2, acc_3;
- acc_0.low = vld1q_s32(bias_ptr);
- acc_1.low = vld1q_s32(bias_ptr);
- acc_2.low = vld1q_s32(bias_ptr);
- acc_3.low = vld1q_s32(bias_ptr);
-
- bias_ptr += 4;
- acc_0.high = vld1q_s32(bias_ptr);
- acc_1.high = vld1q_s32(bias_ptr);
- acc_2.high = vld1q_s32(bias_ptr);
- acc_3.high = vld1q_s32(bias_ptr);
-
- const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
-
- // Add scope for input registers to help the compiler know that it is
- // not needed.
- {
- // To process 2x2 outputs using a 3x3 filter at stride 2, we require
- // 5x5 inputs. We load the first 5x2 inputs at a time.
- int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, input_9;
-
- const uint8* ptr = input_ptr;
-
- // Load inputs.
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4;
-
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
- temp_4 = vld1_u8(ptr + 4 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
-
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
- temp_4 = vld1_u8(ptr + 4 * input_depth);
-
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
-
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- }
-
- acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2,
- input_0, input_1, input_2);
-
- acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2,
- input_2, input_3, input_4);
-
- acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5,
- input_5, input_6, input_7);
-
- acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5,
- input_7, input_8, input_9);
-
- // Load next inputs.
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4;
-
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
- temp_4 = vld1_u8(ptr + 4 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
-
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
- temp_4 = vld1_u8(ptr + 4 * input_depth);
-
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
-
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_9 = vaddq_s16(input_9, input_offset_vec);
- }
-
- acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8,
- input_0, input_1, input_2);
-
- acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8,
- input_2, input_3, input_4);
-
- // Moving onto the two bottom outputs.
- acc_2 = MultiplyAccumulateRow(acc_2, filter.f0, filter.f1, filter.f2,
- input_0, input_1, input_2);
-
- acc_3 = MultiplyAccumulateRow(acc_3, filter.f0, filter.f1, filter.f2,
- input_2, input_3, input_4);
-
- acc_2 = MultiplyAccumulateRow(acc_2, filter.f3, filter.f4, filter.f5,
- input_5, input_6, input_7);
-
- acc_3 = MultiplyAccumulateRow(acc_3, filter.f3, filter.f4, filter.f5,
- input_7, input_8, input_9);
-
- // Load last input row.
- {
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4;
-
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- temp_3 = vld1_u8(ptr + 3 * input_depth);
- temp_4 = vld1_u8(ptr + 4 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- }
-
- acc_2 = MultiplyAccumulateRow(acc_2, filter.f6, filter.f7, filter.f8,
- input_0, input_1, input_2);
-
- acc_3 = MultiplyAccumulateRow(acc_3, filter.f6, filter.f7, filter.f8,
- input_2, input_3, input_4);
- }
// Size in bytes of the input-shuffle workspace: a 10x10 spatial window at
// depth 64. Parenthesized so the macro expands safely inside larger
// expressions (e.g. `bytes / DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE`), which
// would misparse with an unparenthesized `10 * 10 * 64`.
#define DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE (10 * 10 * 64)
- DownquantizeAndStore2x2Output(acc_0, acc_1, acc_2, acc_3, output_offset,
- output_multiplier, output_shift,
- output_activation_min, output_activation_max,
- output_ptr, output_depth, output_width);
- }
// Encapsulates constant parameters used in DepthwiseConv.
// 64-bit is used for types that will be added to 64-bit addresses in asm.
//
// NOTE: the field order and offsets form a contract with the hand-written
// asm kernels below, which read members via the hard-coded OFFSET_* byte
// offsets. Do not reorder, insert, or resize fields without updating the
// OFFSET_* macros and the static_asserts that pin this layout.
struct DepthwiseConvParams {
  // Input depth (channels); also used directly as the asm post-index byte
  // increment between adjacent uint8 input pixels.
  int64_t input_depth;
  // Byte stride between consecutive input rows.
  int64_t input_row_size;
  // Output depth (channels); used as the asm byte increment between
  // adjacent uint8 output pixels.
  int64_t output_depth;
  // Byte stride between consecutive output rows.
  int64_t output_row_size;
  int32 input_offset;       // Quantization zero-point added to input values.
  int32 output_offset;      // Quantization zero-point added to outputs.
  int32 filter_offset;      // Quantization zero-point added to filter values.
  int32 output_multiplier;  // Fixed-point multiplier for requantization.
  int32 output_activation_min;  // Lower clamp bound on the output.
  int32 output_activation_max;  // Upper clamp bound on the output.
  int32 output_shift;  // Right-shift amount for requantization (negated in asm).
  int32 input_width;
  int32 input_height;
  int32 output_width;
  int32 output_height;
};
-template <>
-struct ConvKernel3x3FilterDepth8<2, 4, 2, 2> {
- static inline void Run(const uint8* input_ptr, int input_depth,
- int32 input_offset, int input_row_size,
- const uint8* filter_ptr, int32 filter_offset,
- const int32* bias_ptr, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_ptr,
- int output_depth, int output_width) {
- // Reuse 2x2 kernel twice.
- ConvKernel3x3FilterDepth8<2, 2, 2, 2>::Run(
- input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
- filter_offset, bias_ptr, output_offset, output_multiplier, output_shift,
- output_activation_min, output_activation_max, output_ptr, output_depth,
- output_width);
-
- ConvKernel3x3FilterDepth8<2, 2, 2, 2>::Run(
- input_ptr + 4 * input_depth, input_depth, input_offset, input_row_size,
- filter_ptr, filter_offset, bias_ptr, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max,
- output_ptr + 2 * output_depth, output_depth, output_width);
- }
-};
// Two-level stringification: STR(x) first expands the macro argument x and
// then turns the result into a string literal. The indirection through
// STR_UNEXPANDED is required so that e.g. STR(OFFSET_INPUT_OFFSET) yields
// "32" rather than "OFFSET_INPUT_OFFSET" when spliced into asm strings.
#define STR(s) STR_UNEXPANDED(s)
#define STR_UNEXPANDED(s) #s

// Represents the number of bytes offset from the start of the
// DepthwiseConvParams struct. This is used in the asm to load parameters.
// Keep these values in sync with the static_asserts below.
#define OFFSET_INPUT_DEPTH 0
#define OFFSET_INPUT_ROW_SIZE 8
#define OFFSET_OUTPUT_DEPTH 16
#define OFFSET_OUTPUT_ROW_SIZE 24
#define OFFSET_INPUT_OFFSET 32
#define OFFSET_OUTPUT_OFFSET 36
#define OFFSET_FILTER_OFFSET 40
#define OFFSET_OUTPUT_MULTIPLIER 44
#define OFFSET_OUTPUT_ACTIVATION_MIN 48
#define OFFSET_OUTPUT_ACTIVATION_MAX 52
#define OFFSET_OUTPUT_SHIFT 56
#define OFFSET_INPUT_WIDTH 60
#define OFFSET_INPUT_HEIGHT 64
#define OFFSET_OUTPUT_WIDTH 68
#define OFFSET_OUTPUT_HEIGHT 72
+
+static_assert(offsetof(DepthwiseConvParams, input_depth) ==
+ OFFSET_INPUT_DEPTH, "");
+static_assert(offsetof(DepthwiseConvParams, input_row_size) ==
+ OFFSET_INPUT_ROW_SIZE, "");
+static_assert(offsetof(DepthwiseConvParams, output_depth) ==
+ OFFSET_OUTPUT_DEPTH, "");
+static_assert(offsetof(DepthwiseConvParams, output_row_size) ==
+ OFFSET_OUTPUT_ROW_SIZE, "");
+static_assert(offsetof(DepthwiseConvParams, input_offset) ==
+ OFFSET_INPUT_OFFSET, "");
+static_assert(offsetof(DepthwiseConvParams, output_offset) ==
+ OFFSET_OUTPUT_OFFSET, "");
+static_assert(offsetof(DepthwiseConvParams, filter_offset) ==
+ OFFSET_FILTER_OFFSET, "");
+static_assert(offsetof(DepthwiseConvParams, output_multiplier) ==
+ OFFSET_OUTPUT_MULTIPLIER, "");
+static_assert(offsetof(DepthwiseConvParams, output_activation_min) ==
+ OFFSET_OUTPUT_ACTIVATION_MIN, "");
+static_assert(offsetof(DepthwiseConvParams, output_activation_max) ==
+ OFFSET_OUTPUT_ACTIVATION_MAX, "");
+static_assert(offsetof(DepthwiseConvParams, output_shift) ==
+ OFFSET_OUTPUT_SHIFT, "");
+static_assert(offsetof(DepthwiseConvParams, input_width) ==
+ OFFSET_INPUT_WIDTH, "");
+static_assert(offsetof(DepthwiseConvParams, input_height) ==
+ OFFSET_INPUT_HEIGHT, "");
+static_assert(offsetof(DepthwiseConvParams, output_width) ==
+ OFFSET_OUTPUT_WIDTH, "");
+static_assert(offsetof(DepthwiseConvParams, output_height) ==
+ OFFSET_OUTPUT_HEIGHT, "");
+
// Primary template for the depthwise-conv window kernel, parameterized on
// channel depth and stride. Intentionally empty: only explicit
// specializations (e.g. DepthwiseConvWindow<8, 1, 1> below) provide a
// Run() implementation, so instantiating an unsupported configuration
// fails at compile time.
template <int32 kDepth, int32 kStrideWidth, int32 kStrideHeight>
struct DepthwiseConvWindow {};
template <>
-struct ConvKernel3x3FilterDepth8<2, 1, 2, 2> {
- static inline void Run(const uint8* input_ptr, int input_depth,
- int32 input_offset, int input_row_size,
- const uint8* filter_ptr, int32 filter_offset,
- const int32* bias_ptr, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_ptr,
- int output_depth, int output_width) {
- const int output_row_size = output_depth * output_width;
-
- Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
-
- const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
- int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8;
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7,
- temp_8;
-
- const uint8* ptr = input_ptr;
-
- // Load all inputs for top output.
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- ptr += input_row_size;
- temp_3 = vld1_u8(ptr);
- temp_4 = vld1_u8(ptr + input_depth);
- temp_5 = vld1_u8(ptr + 2 * input_depth);
- ptr += input_row_size;
- temp_6 = vld1_u8(ptr);
- temp_7 = vld1_u8(ptr + input_depth);
- temp_8 = vld1_u8(ptr + 2 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
-
- DotProductAndStore(
- filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, bias_ptr, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max, output_ptr);
-
- // Second output.
- output_ptr += output_row_size;
-
- ptr += input_row_size;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- ptr += input_row_size;
- temp_3 = vld1_u8(ptr);
- temp_4 = vld1_u8(ptr + input_depth);
- temp_5 = vld1_u8(ptr + 2 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
-
- DotProductAndStore(
- filter, input_6, input_7, input_8, input_0, input_1, input_2, input_3,
- input_4, input_5, bias_ptr, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max, output_ptr);
+struct DepthwiseConvWindow<8, 1, 1> {
+ public:
+ static void Run(const uint8* input_ptr, const uint8* filter_ptr,
+ const int32* bias_ptr, uint8* output_ptr, int64_t input_depth,
+ int64_t input_row_size, int32 output_window_height,
+ int32 output_window_width,
+ const DepthwiseConvParams* params_ptr) {
+ const int64_t input_width_increment = 2 * input_depth;
+ const int64_t input_height_increment = 2 * input_row_size;
+ const int64_t output_height_increment = 2 * params_ptr->output_row_size;
+
+#define DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "1"
+#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "2"
+#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "3"
+#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "4"
+#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "5"
+#define DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "6"
+#define DEPTHWISECONV_LABEL_HEIGHT_1 "7"
+#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "8"
+#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "9"
+#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER "10"
+#define DEPTHWISECONV_LABEL_HEIGHT_1_END "11"
+
+ asm volatile(
+ // Performs depthwise convolutions for a window specified by
+ // |output_window_height| and |output_window_width|. The inner-most loop
+ // processes 2x2 outputs, and any leftovers at the end.
+ //
+ // Algorithm works as follows:
+ //
+ // 1. Load filters of 8 depth (8x3x3). Registers v0--v8 hold filter
+ // values.
+ // 2. For 2 output heights at a time:
+ // i. For 2 output widths at a time, load inputs for a 2x1 (2
+ // height, 1 width) output window (4x3 input window).
+ // Registers v9--v20 hold input values. Mul-add with
+ // accumulators v21--v24. Then run activation, downquantize
+ // and store. Repeat for the next 2x1 output window,
+ // leveraging overlapping inputs.
+ // ii. Handle single leftover width if exists.
+ // 3. Handle single leftover height if exists.
+ // i. For 2 output widths at a time, load inputs for a 1x2 (1
+ // height, 2 width) output window (3x4 input window).
+ // Registers v9--v20 hold input values. Mul-add with
+ // accumulators v21--v24. Then run activation, downquantize
+ // and store. Repeat for the next 1x2 output window,
+ // leveraging overlapping inputs.
+ // ii. Handle single leftover width if exists.
+ //
+ // Loads are placed as soon as the register is no longer needed and
+ // interleaved with arithmetic operations to take advantage of
+ // dual-issue pipelines. We also add input offsets as far from the loads
+ // as possible to give loads enough cycles to fetch data from memory.
+
+ // Set "constant" registers. These registers may be replaced with temp
+ // values from time to time when there are not enough NEON registers.
+ // We use x9--x15 general purpose registers as they are caller-saved
+ // temporary registers (see http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf). // NOLINT
+ "ldr w9, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
+ "ldr x3, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
+ "cmp %w[output_window_height], #2\n"
+ "dup v26.8h, w9\n"
+ "ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
+ "ldr w2, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
+ "dup v27.4s, w9\n"
+ "ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_SHIFT) "]\n"
+ "dup v29.4s, w2\n"
+ "ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
+ "dup v30.4s, w4\n"
+ "ldr w0, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
+ "dup v31.4s, w0\n"
+ "neg w9, w9\n"
+ "dup v28.4s, w9\n"
+ "ldr w9, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
+ "add x10, %[bias_ptr], #16\n"
+ "ldr x1, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n"
+ "dup v9.8h, w9\n"
+
+ // Load filters and add offsets.
+ "ld1 {v0.8b}, [%[filter_ptr]], x3\n"
+ "ld1 {v1.8b}, [%[filter_ptr]], x3\n"
+ "uaddw v0.8h, v9.8h, v0.8b\n"
+ "ld1 {v2.8b}, [%[filter_ptr]], x3\n"
+ "uaddw v1.8h, v9.8h, v1.8b\n"
+ "ld1 {v3.8b}, [%[filter_ptr]], x3\n"
+ "uaddw v2.8h, v9.8h, v2.8b\n"
+ "ld1 {v4.8b}, [%[filter_ptr]], x3\n"
+ "uaddw v3.8h, v9.8h, v3.8b\n"
+ "ld1 {v5.8b}, [%[filter_ptr]], x3\n"
+ "uaddw v4.8h, v9.8h, v4.8b\n"
+ "ld1 {v6.8b}, [%[filter_ptr]], x3\n"
+ "uaddw v5.8h, v9.8h, v5.8b\n"
+ "ld1 {v7.8b}, [%[filter_ptr]], x3\n"
+ "uaddw v6.8h, v9.8h, v6.8b\n"
+ "ld1 {v8.8b}, [%[filter_ptr]], x3\n"
+ "uaddw v7.8h, v9.8h, v7.8b\n"
+ "uaddw v8.8h, v9.8h, v8.8b\n"
+
+ "blt " DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "f\n"
+
+ //"loop_%=:\n"
+ DEPTHWISECONV_LABEL_HEIGHT_2_LOOP ":\n"
+ // This loop processes 2x2 outputs. To avoid register exhaustion,
+ // inputs for the left 2 outputs are loaded first, then the right
+ // two outputs.
+ "mov x11, %[input_ptr]\n"
+ "mov x12, x11\n"
+ "ld1 {v9.8b}, [x12], %[input_depth]\n"
+ "add x13, x11, %[input_row_size]\n"
+ "ld1 {v10.8b}, [x12], %[input_depth]\n"
+ "add x14, x13, %[input_row_size]\n"
+ "ld1 {v11.8b}, [x12], %[input_depth]\n"
+ "add x15, x14, %[input_row_size]\n"
+ "ld1 {v12.8b}, [x13], %[input_depth]\n"
+ "mov w5, %w[output_window_width]\n"
+ "ld1 {v13.8b}, [x13], %[input_depth]\n"
+ "mov x6, %[output_ptr]\n"
+ "ld1 {v14.8b}, [x13], %[input_depth]\n"
+ "add x7, %[output_ptr], x1\n"
+ "ld1 {v15.8b}, [x14], %[input_depth]\n"
+ // The height 2 / width 2 loop loads an extra 2x1 outputs (2 height,
+ // 1 width) in anticipation for the next iteration. Make sure
+ // |output_window_width| is large enough to handle the additional
+ // loads, otherwise jump to specific the appropriate label to handle
+ // smaller widths.
+ "cmp w5, #2\n"
+ "uaddw v9.8h, v26.8h, v9.8b\n"
+ "ld1 {v16.8b}, [x14], %[input_depth]\n"
+ "uaddw v10.8h, v26.8h, v10.8b\n"
+ "ld1 {v17.8b}, [x14], %[input_depth]\n"
+ "uaddw v11.8h, v26.8h, v11.8b\n"
+ "ld1 {v18.8b}, [x15], %[input_depth]\n"
+ "uaddw v12.8h, v26.8h, v12.8b\n"
+ "ld1 {v19.8b}, [x15], %[input_depth]\n"
+ "uaddw v13.8h, v26.8h, v13.8b\n"
+ "ld1 {v20.8b}, [x15], %[input_depth]\n"
+ "uaddw v14.8h, v26.8h, v14.8b\n"
+ "ld1 {v21.4s}, [%[bias_ptr]]\n"
+ "uaddw v15.8h, v26.8h, v15.8b\n"
+ "ld1 {v22.4s}, [x10]\n"
+ "uaddw v16.8h, v26.8h, v16.8b\n"
+ "ld1 {v23.4s}, [%[bias_ptr]]\n"
+ "uaddw v17.8h, v26.8h, v17.8b\n"
+ "ld1 {v24.4s}, [x10]\n"
+ "uaddw v18.8h, v26.8h, v18.8b\n"
+ "uaddw v19.8h, v26.8h, v19.8b\n"
+ "uaddw v20.8h, v26.8h, v20.8b\n"
+
+ "beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "f\n"
+ "cmp w5, #1\n"
+ "beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n"
+
+ //"loop_%=:\n"
+ DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP ":\n"
+ // Mul-add left outputs.
+ "smlal v21.4s, v0.4h, v9.4h\n"
+ "subs w5, w5, #2\n"
+ "smlal2 v22.4s, v0.8h, v9.8h\n"
+ "cmp w5, #3\n"
+ "smlal v23.4s, v0.4h, v12.4h\n"
+ "ld1 {v9.8b}, [x12]\n"
+ "smlal2 v24.4s, v0.8h, v12.8h\n"
+ "smlal v21.4s, v1.4h, v10.4h\n"
+ "smlal2 v22.4s, v1.8h, v10.8h\n"
+ "smlal v23.4s, v1.4h, v13.4h\n"
+ "smlal2 v24.4s, v1.8h, v13.8h\n"
+ "smlal v21.4s, v2.4h, v11.4h\n"
+ "smlal2 v22.4s, v2.8h, v11.8h\n"
+ "smlal v23.4s, v2.4h, v14.4h\n"
+ "smlal2 v24.4s, v2.8h, v14.8h\n"
+ "smlal v21.4s, v3.4h, v12.4h\n"
+ "smlal2 v22.4s, v3.8h, v12.8h\n"
+ "ld1 {v12.8b}, [x13]\n"
+ "smlal v23.4s, v3.4h, v15.4h\n"
+ "smlal2 v24.4s, v3.8h, v15.8h\n"
+ "smlal v21.4s, v4.4h, v13.4h\n"
+ "smlal2 v22.4s, v4.8h, v13.8h\n"
+ "smlal v23.4s, v4.4h, v16.4h\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v21.4s, v5.4h, v14.4h\n"
+ "smlal2 v22.4s, v5.8h, v14.8h\n"
+ "smlal v23.4s, v5.4h, v17.4h\n"
+ "smlal2 v24.4s, v5.8h, v17.8h\n"
+ "smlal v21.4s, v6.4h, v15.4h\n"
+ "smlal2 v22.4s, v6.8h, v15.8h\n"
+ "ld1 {v15.8b}, [x14]\n"
+ "smlal v23.4s, v6.4h, v18.4h\n"
+ "smlal2 v24.4s, v6.8h, v18.8h\n"
+ "ld1 {v18.8b}, [x15]\n"
+ "smlal v21.4s, v7.4h, v16.4h\n"
+ "smlal2 v22.4s, v7.8h, v16.8h\n"
+ "smlal v23.4s, v7.4h, v19.4h\n"
+ "smlal2 v24.4s, v7.8h, v19.8h\n"
+ "smlal v21.4s, v8.4h, v17.4h\n"
+ "smlal2 v22.4s, v8.8h, v17.8h\n"
+ "smlal v23.4s, v8.4h, v20.4h\n"
+ "smlal2 v24.4s, v8.8h, v20.8h\n"
+
+ "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v27.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v27.4s\n"
+ "and v25.16b, v21.16b, v28.16b\n"
+ "and v29.16b, v22.16b, v28.16b\n"
+ "and v30.16b, v23.16b, v28.16b\n"
+ "and v31.16b, v24.16b, v28.16b\n"
+ "sshr v25.4s, v25.4s, #31\n"
+ "sshr v29.4s, v29.4s, #31\n"
+ "sshr v30.4s, v30.4s, #31\n"
+ "sshr v31.4s, v31.4s, #31\n"
+ "sqadd v21.4s, v21.4s, v25.4s\n"
+ "sqadd v22.4s, v22.4s, v29.4s\n"
+ "dup v29.4s, w2\n"
+ "sqadd v23.4s, v23.4s, v30.4s\n"
+ "dup v30.4s, w4\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "dup v31.4s, w0\n"
+ "srshl v21.4s, v21.4s, v28.4s\n"
+ "srshl v22.4s, v22.4s, v28.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "add v23.4s, v23.4s, v29.4s\n"
+ "add v24.4s, v24.4s, v29.4s\n"
+ "smax v21.4s, v21.4s, v30.4s\n"
+ "smax v22.4s, v22.4s, v30.4s\n"
+ "smax v23.4s, v23.4s, v30.4s\n"
+ "smax v24.4s, v24.4s, v30.4s\n"
+ "smin v21.4s, v21.4s, v31.4s\n"
+ "smin v22.4s, v22.4s, v31.4s\n"
+ "smin v23.4s, v23.4s, v31.4s\n"
+ "smin v24.4s, v24.4s, v31.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v21.8h, v22.4s\n"
+ "ld1 {v22.4s}, [x10]\n"
+ "sqxtn2 v23.8h, v24.4s\n"
+ "ld1 {v24.4s}, [x10]\n"
+ "sqxtun v21.8b, v21.8h\n"
+ "sqxtun v23.8b, v23.8h\n"
+ "uaddw v9.8h, v26.8h, v9.8b\n"
+ "st1 {v21.8b}, [x6], x3\n"
+ "uaddw v12.8h, v26.8h, v12.8b\n"
+ "st1 {v23.8b}, [x7], x3\n"
+ "uaddw v15.8h, v26.8h, v15.8b\n"
+ "ld1 {v21.4s}, [%[bias_ptr]]\n"
+ "uaddw v18.8h, v26.8h, v18.8b\n"
+ "ld1 {v23.4s}, [%[bias_ptr]]\n"
+
+ // Mul-add right outputs.
+ "smlal v21.4s, v0.4h, v10.4h\n"
+ "add x11, x11, %[input_width_increment]\n"
+ "smlal2 v22.4s, v0.8h, v10.8h\n"
+ "mov x12, x11\n"
+ "smlal v23.4s, v0.4h, v13.4h\n"
+ "add x13, x11, %[input_row_size]\n"
+ "smlal2 v24.4s, v0.8h, v13.8h\n"
+ "add x14, x13, %[input_row_size]\n"
+ "smlal v21.4s, v1.4h, v11.4h\n"
+ "add x15, x14, %[input_row_size]\n"
+ "smlal2 v22.4s, v1.8h, v11.8h\n"
+ "smlal v23.4s, v1.4h, v14.4h\n"
+ "smlal2 v24.4s, v1.8h, v14.8h\n"
+ "smlal v21.4s, v2.4h, v9.4h\n"
+ "smlal2 v22.4s, v2.8h, v9.8h\n"
+ "ld1 {v9.8b}, [x12], %[input_depth]\n"
+ "smlal v23.4s, v2.4h, v12.4h\n"
+ "ld1 {v10.8b}, [x12], %[input_depth]\n"
+ "smlal2 v24.4s, v2.8h, v12.8h\n"
+ "ld1 {v11.8b}, [x12], %[input_depth]\n"
+ "smlal v21.4s, v3.4h, v13.4h\n"
+ "smlal2 v22.4s, v3.8h, v13.8h\n"
+ "smlal v23.4s, v3.4h, v16.4h\n"
+ "smlal2 v24.4s, v3.8h, v16.8h\n"
+ "smlal v21.4s, v4.4h, v14.4h\n"
+ "smlal2 v22.4s, v4.8h, v14.8h\n"
+ "smlal v23.4s, v4.4h, v17.4h\n"
+ "smlal2 v24.4s, v4.8h, v17.8h\n"
+ "smlal v21.4s, v5.4h, v12.4h\n"
+ "smlal2 v22.4s, v5.8h, v12.8h\n"
+ "ld1 {v12.8b}, [x13], %[input_depth]\n"
+ "smlal v23.4s, v5.4h, v15.4h\n"
+ "ld1 {v13.8b}, [x13], %[input_depth]\n"
+ "smlal2 v24.4s, v5.8h, v15.8h\n"
+ "ld1 {v14.8b}, [x13], %[input_depth]\n"
+ "smlal v21.4s, v6.4h, v16.4h\n"
+ "smlal2 v22.4s, v6.8h, v16.8h\n"
+ "smlal v23.4s, v6.4h, v19.4h\n"
+ "smlal2 v24.4s, v6.8h, v19.8h\n"
+ "smlal v21.4s, v7.4h, v17.4h\n"
+ "smlal2 v22.4s, v7.8h, v17.8h\n"
+ "smlal v23.4s, v7.4h, v20.4h\n"
+ "smlal2 v24.4s, v7.8h, v20.8h\n"
+ "smlal v21.4s, v8.4h, v15.4h\n"
+ "smlal2 v22.4s, v8.8h, v15.8h\n"
+ "ld1 {v15.8b}, [x14], %[input_depth]\n"
+ "smlal v23.4s, v8.4h, v18.4h\n"
+ "ld1 {v16.8b}, [x14], %[input_depth]\n"
+ "smlal2 v24.4s, v8.8h, v18.8h\n"
+ "ld1 {v17.8b}, [x14], %[input_depth]\n"
+
+ "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+ "ld1 {v18.8b}, [x15], %[input_depth]\n"
+ "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+ "ld1 {v19.8b}, [x15], %[input_depth]\n"
+ "sqrdmulh v23.4s, v23.4s, v27.4s\n"
+ "ld1 {v20.8b}, [x15], %[input_depth]\n"
+ "sqrdmulh v24.4s, v24.4s, v27.4s\n"
+ "and v25.16b, v21.16b, v28.16b\n"
+ "and v29.16b, v22.16b, v28.16b\n"
+ "and v30.16b, v23.16b, v28.16b\n"
+ "and v31.16b, v24.16b, v28.16b\n"
+ "sshr v25.4s, v25.4s, #31\n"
+ "sshr v29.4s, v29.4s, #31\n"
+ "sshr v30.4s, v30.4s, #31\n"
+ "sshr v31.4s, v31.4s, #31\n"
+ "sqadd v21.4s, v21.4s, v25.4s\n"
+ "sqadd v22.4s, v22.4s, v29.4s\n"
+ "dup v29.4s, w2\n"
+ "sqadd v23.4s, v23.4s, v30.4s\n"
+ "dup v30.4s, w4\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "dup v31.4s, w0\n"
+ "srshl v21.4s, v21.4s, v28.4s\n"
+ "srshl v22.4s, v22.4s, v28.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "add v23.4s, v23.4s, v29.4s\n"
+ "add v24.4s, v24.4s, v29.4s\n"
+ "smax v21.4s, v21.4s, v30.4s\n"
+ "smax v22.4s, v22.4s, v30.4s\n"
+ "smax v23.4s, v23.4s, v30.4s\n"
+ "smax v24.4s, v24.4s, v30.4s\n"
+ "smin v21.4s, v21.4s, v31.4s\n"
+ "smin v22.4s, v22.4s, v31.4s\n"
+ "smin v23.4s, v23.4s, v31.4s\n"
+ "smin v24.4s, v24.4s, v31.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v21.8h, v22.4s\n"
+ "ld1 {v22.4s}, [x10]\n"
+ "sqxtn2 v23.8h, v24.4s\n"
+ "ld1 {v24.4s}, [x10]\n"
+ "sqxtun v21.8b, v21.8h\n"
+ "sqxtun v23.8b, v23.8h\n"
+ "uaddw v9.8h, v26.8h, v9.8b\n"
+ "st1 {v21.8b}, [x6], x3\n"
+ "uaddw v10.8h, v26.8h, v10.8b\n"
+ "st1 {v23.8b}, [x7], x3\n"
+ "uaddw v11.8h, v26.8h, v11.8b\n"
+ "uaddw v12.8h, v26.8h, v12.8b\n"
+ "uaddw v13.8h, v26.8h, v13.8b\n"
+ "uaddw v14.8h, v26.8h, v14.8b\n"
+ "uaddw v15.8h, v26.8h, v15.8b\n"
+ "ld1 {v21.4s}, [%[bias_ptr]]\n"
+ "uaddw v16.8h, v26.8h, v16.8b\n"
+ "ld1 {v23.4s}, [%[bias_ptr]]\n"
+ "uaddw v17.8h, v26.8h, v17.8b\n"
+ "uaddw v18.8h, v26.8h, v18.8b\n"
+ "uaddw v19.8h, v26.8h, v19.8b\n"
+ "uaddw v20.8h, v26.8h, v20.8b\n"
+
+ "bge " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "b\n"
+
+ // At this point, there will be one of 2 width or 1 width leftover,
+ // not both.
+ "cmp w5, #2\n"
+ "blt " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n"
+
+ // Handle last 2 columns if exists.
+ DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER ":\n"
+ // Mul-add left outputs.
+ "smlal v21.4s, v0.4h, v9.4h\n"
+ "smlal2 v22.4s, v0.8h, v9.8h\n"
+ "smlal v23.4s, v0.4h, v12.4h\n"
+ "ld1 {v9.8b}, [x12]\n"
+ "smlal2 v24.4s, v0.8h, v12.8h\n"
+ "smlal v21.4s, v1.4h, v10.4h\n"
+ "smlal2 v22.4s, v1.8h, v10.8h\n"
+ "smlal v23.4s, v1.4h, v13.4h\n"
+ "smlal2 v24.4s, v1.8h, v13.8h\n"
+ "smlal v21.4s, v2.4h, v11.4h\n"
+ "smlal2 v22.4s, v2.8h, v11.8h\n"
+ "smlal v23.4s, v2.4h, v14.4h\n"
+ "smlal2 v24.4s, v2.8h, v14.8h\n"
+ "smlal v21.4s, v3.4h, v12.4h\n"
+ "smlal2 v22.4s, v3.8h, v12.8h\n"
+ "ld1 {v12.8b}, [x13]\n"
+ "smlal v23.4s, v3.4h, v15.4h\n"
+ "smlal2 v24.4s, v3.8h, v15.8h\n"
+ "smlal v21.4s, v4.4h, v13.4h\n"
+ "smlal2 v22.4s, v4.8h, v13.8h\n"
+ "smlal v23.4s, v4.4h, v16.4h\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v21.4s, v5.4h, v14.4h\n"
+ "smlal2 v22.4s, v5.8h, v14.8h\n"
+ "smlal v23.4s, v5.4h, v17.4h\n"
+ "smlal2 v24.4s, v5.8h, v17.8h\n"
+ "smlal v21.4s, v6.4h, v15.4h\n"
+ "smlal2 v22.4s, v6.8h, v15.8h\n"
+ "ld1 {v15.8b}, [x14]\n"
+ "smlal v23.4s, v6.4h, v18.4h\n"
+ "smlal2 v24.4s, v6.8h, v18.8h\n"
+ "ld1 {v18.8b}, [x15]\n"
+ "smlal v21.4s, v7.4h, v16.4h\n"
+ "smlal2 v22.4s, v7.8h, v16.8h\n"
+ "smlal v23.4s, v7.4h, v19.4h\n"
+ "smlal2 v24.4s, v7.8h, v19.8h\n"
+ "smlal v21.4s, v8.4h, v17.4h\n"
+ "smlal2 v22.4s, v8.8h, v17.8h\n"
+ "smlal v23.4s, v8.4h, v20.4h\n"
+ "smlal2 v24.4s, v8.8h, v20.8h\n"
+
+ "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v27.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v27.4s\n"
+ "and v25.16b, v21.16b, v28.16b\n"
+ "and v29.16b, v22.16b, v28.16b\n"
+ "and v30.16b, v23.16b, v28.16b\n"
+ "and v31.16b, v24.16b, v28.16b\n"
+ "sshr v25.4s, v25.4s, #31\n"
+ "sshr v29.4s, v29.4s, #31\n"
+ "sshr v30.4s, v30.4s, #31\n"
+ "sshr v31.4s, v31.4s, #31\n"
+ "sqadd v21.4s, v21.4s, v25.4s\n"
+ "sqadd v22.4s, v22.4s, v29.4s\n"
+ "dup v29.4s, w2\n"
+ "sqadd v23.4s, v23.4s, v30.4s\n"
+ "dup v30.4s, w4\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "dup v31.4s, w0\n"
+ "srshl v21.4s, v21.4s, v28.4s\n"
+ "srshl v22.4s, v22.4s, v28.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "add v23.4s, v23.4s, v29.4s\n"
+ "add v24.4s, v24.4s, v29.4s\n"
+ "smax v21.4s, v21.4s, v30.4s\n"
+ "smax v22.4s, v22.4s, v30.4s\n"
+ "smax v23.4s, v23.4s, v30.4s\n"
+ "smax v24.4s, v24.4s, v30.4s\n"
+ "smin v21.4s, v21.4s, v31.4s\n"
+ "smin v22.4s, v22.4s, v31.4s\n"
+ "smin v23.4s, v23.4s, v31.4s\n"
+ "smin v24.4s, v24.4s, v31.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v21.8h, v22.4s\n"
+ "ld1 {v22.4s}, [x10]\n"
+ "sqxtn2 v23.8h, v24.4s\n"
+ "ld1 {v24.4s}, [x10]\n"
+ "sqxtun v21.8b, v21.8h\n"
+ "sqxtun v23.8b, v23.8h\n"
+ "uaddw v9.8h, v26.8h, v9.8b\n"
+ "st1 {v21.8b}, [x6], x3\n"
+ "uaddw v12.8h, v26.8h, v12.8b\n"
+ "st1 {v23.8b}, [x7], x3\n"
+ "uaddw v15.8h, v26.8h, v15.8b\n"
+ "ld1 {v21.4s}, [%[bias_ptr]]\n"
+ "uaddw v18.8h, v26.8h, v18.8b\n"
+ "ld1 {v23.4s}, [%[bias_ptr]]\n"
+
+ // Mul-add right outputs.
+ "smlal v21.4s, v0.4h, v10.4h\n"
+ "smlal2 v22.4s, v0.8h, v10.8h\n"
+ "smlal v23.4s, v0.4h, v13.4h\n"
+ "smlal2 v24.4s, v0.8h, v13.8h\n"
+ "smlal v21.4s, v1.4h, v11.4h\n"
+ "smlal2 v22.4s, v1.8h, v11.8h\n"
+ "smlal v23.4s, v1.4h, v14.4h\n"
+ "smlal2 v24.4s, v1.8h, v14.8h\n"
+ "smlal v21.4s, v2.4h, v9.4h\n"
+ "smlal2 v22.4s, v2.8h, v9.8h\n"
+ "smlal v23.4s, v2.4h, v12.4h\n"
+ "smlal2 v24.4s, v2.8h, v12.8h\n"
+ "smlal v21.4s, v3.4h, v13.4h\n"
+ "smlal2 v22.4s, v3.8h, v13.8h\n"
+ "smlal v23.4s, v3.4h, v16.4h\n"
+ "smlal2 v24.4s, v3.8h, v16.8h\n"
+ "smlal v21.4s, v4.4h, v14.4h\n"
+ "smlal2 v22.4s, v4.8h, v14.8h\n"
+ "smlal v23.4s, v4.4h, v17.4h\n"
+ "smlal2 v24.4s, v4.8h, v17.8h\n"
+ "smlal v21.4s, v5.4h, v12.4h\n"
+ "smlal2 v22.4s, v5.8h, v12.8h\n"
+ "smlal v23.4s, v5.4h, v15.4h\n"
+ "smlal2 v24.4s, v5.8h, v15.8h\n"
+ "smlal v21.4s, v6.4h, v16.4h\n"
+ "smlal2 v22.4s, v6.8h, v16.8h\n"
+ "smlal v23.4s, v6.4h, v19.4h\n"
+ "smlal2 v24.4s, v6.8h, v19.8h\n"
+ "smlal v21.4s, v7.4h, v17.4h\n"
+ "smlal2 v22.4s, v7.8h, v17.8h\n"
+ "smlal v23.4s, v7.4h, v20.4h\n"
+ "smlal2 v24.4s, v7.8h, v20.8h\n"
+ "smlal v21.4s, v8.4h, v15.4h\n"
+ "smlal2 v22.4s, v8.8h, v15.8h\n"
+ "smlal v23.4s, v8.4h, v18.4h\n"
+ "smlal2 v24.4s, v8.8h, v18.8h\n"
+
+ "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v27.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v27.4s\n"
+ "and v25.16b, v21.16b, v28.16b\n"
+ "and v29.16b, v22.16b, v28.16b\n"
+ "and v30.16b, v23.16b, v28.16b\n"
+ "and v31.16b, v24.16b, v28.16b\n"
+ "sshr v25.4s, v25.4s, #31\n"
+ "sshr v29.4s, v29.4s, #31\n"
+ "sshr v30.4s, v30.4s, #31\n"
+ "sshr v31.4s, v31.4s, #31\n"
+ "sqadd v21.4s, v21.4s, v25.4s\n"
+ "sqadd v22.4s, v22.4s, v29.4s\n"
+ "dup v29.4s, w2\n"
+ "sqadd v23.4s, v23.4s, v30.4s\n"
+ "dup v30.4s, w4\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "dup v31.4s, w0\n"
+ "srshl v21.4s, v21.4s, v28.4s\n"
+ "srshl v22.4s, v22.4s, v28.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "add v23.4s, v23.4s, v29.4s\n"
+ "add v24.4s, v24.4s, v29.4s\n"
+ "smax v21.4s, v21.4s, v30.4s\n"
+ "smax v22.4s, v22.4s, v30.4s\n"
+ "smax v23.4s, v23.4s, v30.4s\n"
+ "smax v24.4s, v24.4s, v30.4s\n"
+ "smin v21.4s, v21.4s, v31.4s\n"
+ "smin v22.4s, v22.4s, v31.4s\n"
+ "smin v23.4s, v23.4s, v31.4s\n"
+ "smin v24.4s, v24.4s, v31.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v21.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v24.4s\n"
+ "sqxtun v21.8b, v21.8h\n"
+ "sqxtun v23.8b, v23.8h\n"
+ "st1 {v21.8b}, [x6], x3\n"
+ "st1 {v23.8b}, [x7], x3\n"
+ "b " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "f\n"
+
+ DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER ":\n"
+ "smlal v21.4s, v0.4h, v9.4h\n"
+ "smlal2 v22.4s, v0.8h, v9.8h\n"
+ "smlal v23.4s, v0.4h, v12.4h\n"
+ "smlal2 v24.4s, v0.8h, v12.8h\n"
+ "smlal v21.4s, v1.4h, v10.4h\n"
+ "smlal2 v22.4s, v1.8h, v10.8h\n"
+ "smlal v23.4s, v1.4h, v13.4h\n"
+ "smlal2 v24.4s, v1.8h, v13.8h\n"
+ "smlal v21.4s, v2.4h, v11.4h\n"
+ "smlal2 v22.4s, v2.8h, v11.8h\n"
+ "smlal v23.4s, v2.4h, v14.4h\n"
+ "smlal2 v24.4s, v2.8h, v14.8h\n"
+ "smlal v21.4s, v3.4h, v12.4h\n"
+ "smlal2 v22.4s, v3.8h, v12.8h\n"
+ "smlal v23.4s, v3.4h, v15.4h\n"
+ "smlal2 v24.4s, v3.8h, v15.8h\n"
+ "smlal v21.4s, v4.4h, v13.4h\n"
+ "smlal2 v22.4s, v4.8h, v13.8h\n"
+ "smlal v23.4s, v4.4h, v16.4h\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v21.4s, v5.4h, v14.4h\n"
+ "smlal2 v22.4s, v5.8h, v14.8h\n"
+ "smlal v23.4s, v5.4h, v17.4h\n"
+ "smlal2 v24.4s, v5.8h, v17.8h\n"
+ "smlal v21.4s, v6.4h, v15.4h\n"
+ "smlal2 v22.4s, v6.8h, v15.8h\n"
+ "smlal v23.4s, v6.4h, v18.4h\n"
+ "smlal2 v24.4s, v6.8h, v18.8h\n"
+ "smlal v21.4s, v7.4h, v16.4h\n"
+ "smlal2 v22.4s, v7.8h, v16.8h\n"
+ "smlal v23.4s, v7.4h, v19.4h\n"
+ "smlal2 v24.4s, v7.8h, v19.8h\n"
+ "smlal v21.4s, v8.4h, v17.4h\n"
+ "smlal2 v22.4s, v8.8h, v17.8h\n"
+ "smlal v23.4s, v8.4h, v20.4h\n"
+ "smlal2 v24.4s, v8.8h, v20.8h\n"
+
+ "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v27.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v27.4s\n"
+ "and v9.16b, v21.16b, v28.16b\n"
+ "and v12.16b, v22.16b, v28.16b\n"
+ "and v15.16b, v23.16b, v28.16b\n"
+ "and v18.16b, v24.16b, v28.16b\n"
+ "sshr v9.4s, v9.4s, #31\n"
+ "sshr v12.4s, v12.4s, #31\n"
+ "sshr v15.4s, v15.4s, #31\n"
+ "sshr v18.4s, v18.4s, #31\n"
+ "sqadd v21.4s, v21.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v12.4s\n"
+ "sqadd v23.4s, v23.4s, v15.4s\n"
+ "sqadd v24.4s, v24.4s, v18.4s\n"
+ "srshl v21.4s, v21.4s, v28.4s\n"
+ "srshl v22.4s, v22.4s, v28.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "add v23.4s, v23.4s, v29.4s\n"
+ "add v24.4s, v24.4s, v29.4s\n"
+ "smax v21.4s, v21.4s, v30.4s\n"
+ "smax v22.4s, v22.4s, v30.4s\n"
+ "smax v23.4s, v23.4s, v30.4s\n"
+ "smax v24.4s, v24.4s, v30.4s\n"
+ "smin v21.4s, v21.4s, v31.4s\n"
+ "smin v22.4s, v22.4s, v31.4s\n"
+ "smin v23.4s, v23.4s, v31.4s\n"
+ "smin v24.4s, v24.4s, v31.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v21.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v24.4s\n"
+ "sqxtun v21.8b, v21.8h\n"
+ "sqxtun v23.8b, v23.8h\n"
+ "st1 {v21.8b}, [x6], x3\n"
+ "st1 {v23.8b}, [x7], x3\n"
+
+ DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP ":\n"
+ "subs %w[output_window_height], %w[output_window_height], #2\n"
+ "add %[input_ptr], %[input_ptr], %[input_height_increment]\n"
+ "cmp %w[output_window_height], #2\n"
+ "add %[output_ptr], %[output_ptr], %[output_height_increment]\n"
+ "bge " DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "b\n"
+
+ DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP ":\n"
+ "cmp %w[output_window_height], #1\n"
+ "blt " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
+
+ DEPTHWISECONV_LABEL_HEIGHT_1 ":\n"
+ "mov x12, %[input_ptr]\n"
+ "ld1 {v9.8b}, [x12], %[input_depth]\n"
+ "add x13, %[input_ptr], %[input_row_size]\n"
+ "ld1 {v10.8b}, [x12], %[input_depth]\n"
+ "add x14, x13, %[input_row_size]\n"
+ "ld1 {v11.8b}, [x12], %[input_depth]\n"
+ "add x15, x14, %[input_row_size]\n"
+ "mov w5, %w[output_window_width]\n"
+ "ld1 {v13.8b}, [x13], %[input_depth]\n"
+ "mov x6, %[output_ptr]\n"
+ "ld1 {v14.8b}, [x13], %[input_depth]\n"
+ "add x7, %[output_ptr], x1\n"
+ "ld1 {v15.8b}, [x13], %[input_depth]\n"
+ // The height 1 / width 2 loop loads an extra 1x1 output in anticipation
+ // for the next iteration. Make sure |output_window_width| is large
+ // enough to handle the additional load, otherwise jump to the
+ // appropriate label to handle smaller widths.
+ "cmp w5, #2\n"
+ "ld1 {v17.8b}, [x14], %[input_depth]\n"
+ "ld1 {v18.8b}, [x14], %[input_depth]\n"
+ "ld1 {v19.8b}, [x14], %[input_depth]\n"
+ "ld1 {v21.4s}, [%[bias_ptr]]\n"
+ "ld1 {v22.4s}, [x10]\n"
+ "ld1 {v23.4s}, [%[bias_ptr]]\n"
+ "ld1 {v24.4s}, [x10]\n"
+
+ "uaddw v9.8h, v26.8h, v9.8b\n"
+ "uaddw v10.8h, v26.8h, v10.8b\n"
+ "uaddw v11.8h, v26.8h, v11.8b\n"
+ "uaddw v13.8h, v26.8h, v13.8b\n"
+ "uaddw v14.8h, v26.8h, v14.8b\n"
+ "uaddw v15.8h, v26.8h, v15.8b\n"
+ "uaddw v17.8h, v26.8h, v17.8b\n"
+ "uaddw v18.8h, v26.8h, v18.8b\n"
+ "uaddw v19.8h, v26.8h, v19.8b\n"
+
+ "beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER "f\n"
+ "cmp w5, #1\n"
+ "beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n"
+
+ //"loop_%=:\n"
+ DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP ":\n"
+ // Load inputs for 3x4 input window which corresponds to a 1x2 output
+ // window.
+ "smlal v21.4s, v0.4h, v9.4h\n"
+ "ld1 {v12.8b}, [x12]\n"
+ "smlal2 v22.4s, v0.8h, v9.8h\n"
+ "ld1 {v16.8b}, [x13]\n"
+ "smlal v23.4s, v0.4h, v10.4h\n"
+ "ld1 {v20.8b}, [x14]\n"
+ "smlal2 v24.4s, v0.8h, v10.8h\n"
+ "subs w5, w5, #2\n"
+ "smlal v21.4s, v1.4h, v10.4h\n"
+ "cmp w5, #3\n"
+ "smlal2 v22.4s, v1.8h, v10.8h\n"
+ "add %[input_ptr], %[input_ptr], %[input_width_increment]\n"
+ "smlal v23.4s, v1.4h, v11.4h\n"
+ "mov x12, %[input_ptr]\n"
+ "smlal2 v24.4s, v1.8h, v11.8h\n"
+ "ld1 {v9.8b}, [x12], %[input_depth]\n"
+ "smlal v21.4s, v2.4h, v11.4h\n"
+ "ld1 {v10.8b}, [x12], %[input_depth]\n"
+ "uaddw v12.8h, v26.8h, v12.8b\n"
+ "smlal2 v22.4s, v2.8h, v11.8h\n"
+ "ld1 {v11.8b}, [x12], %[input_depth]\n"
+ "add x13, %[input_ptr], %[input_row_size]\n"
+ "smlal v23.4s, v2.4h, v12.4h\n"
+ "add x14, x13, %[input_row_size]\n"
+ "smlal2 v24.4s, v2.8h, v12.8h\n"
+ "smlal v21.4s, v3.4h, v13.4h\n"
+ "add x15, x14, %[input_row_size]\n"
+ "smlal2 v22.4s, v3.8h, v13.8h\n"
+ "ld1 {v13.8b}, [x13], %[input_depth]\n"
+ "smlal v23.4s, v3.4h, v14.4h\n"
+ "smlal2 v24.4s, v3.8h, v14.8h\n"
+ "smlal v21.4s, v4.4h, v14.4h\n"
+ "smlal2 v22.4s, v4.8h, v14.8h\n"
+ "ld1 {v14.8b}, [x13], %[input_depth]\n"
+ "smlal v23.4s, v4.4h, v15.4h\n"
+ "smlal2 v24.4s, v4.8h, v15.8h\n"
+ "smlal v21.4s, v5.4h, v15.4h\n"
+ "uaddw v16.8h, v26.8h, v16.8b\n"
+ "smlal2 v22.4s, v5.8h, v15.8h\n"
+ "ld1 {v15.8b}, [x13], %[input_depth]\n"
+ "smlal v23.4s, v5.4h, v16.4h\n"
+ "smlal2 v24.4s, v5.8h, v16.8h\n"
+ "smlal v21.4s, v6.4h, v17.4h\n"
+ "smlal2 v22.4s, v6.8h, v17.8h\n"
+ "ld1 {v17.8b}, [x14], %[input_depth]\n"
+ "smlal v23.4s, v6.4h, v18.4h\n"
+ "smlal2 v24.4s, v6.8h, v18.8h\n"
+ "smlal v21.4s, v7.4h, v18.4h\n"
+ "smlal2 v22.4s, v7.8h, v18.8h\n"
+ "ld1 {v18.8b}, [x14], %[input_depth]\n"
+ "smlal v23.4s, v7.4h, v19.4h\n"
+ "smlal2 v24.4s, v7.8h, v19.8h\n"
+ "smlal v21.4s, v8.4h, v19.4h\n"
+ "uaddw v20.8h, v26.8h, v20.8b\n"
+ "smlal2 v22.4s, v8.8h, v19.8h\n"
+ "ld1 {v19.8b}, [x14], %[input_depth]\n"
+ "smlal v23.4s, v8.4h, v20.4h\n"
+ "smlal2 v24.4s, v8.8h, v20.8h\n"
+
+ "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v27.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v27.4s\n"
+ "and v25.16b, v21.16b, v28.16b\n"
+ "and v29.16b, v22.16b, v28.16b\n"
+ "and v30.16b, v23.16b, v28.16b\n"
+ "and v31.16b, v24.16b, v28.16b\n"
+ "sshr v25.4s, v25.4s, #31\n"
+ "sshr v29.4s, v29.4s, #31\n"
+ "sshr v30.4s, v30.4s, #31\n"
+ "sshr v31.4s, v31.4s, #31\n"
+ "sqadd v21.4s, v21.4s, v25.4s\n"
+ "sqadd v22.4s, v22.4s, v29.4s\n"
+ "dup v29.4s, w2\n"
+ "sqadd v23.4s, v23.4s, v30.4s\n"
+ "dup v30.4s, w4\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "dup v31.4s, w0\n"
+ "srshl v21.4s, v21.4s, v28.4s\n"
+ "srshl v22.4s, v22.4s, v28.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "add v23.4s, v23.4s, v29.4s\n"
+ "add v24.4s, v24.4s, v29.4s\n"
+ "smax v21.4s, v21.4s, v30.4s\n"
+ "smax v22.4s, v22.4s, v30.4s\n"
+ "smax v23.4s, v23.4s, v30.4s\n"
+ "smax v24.4s, v24.4s, v30.4s\n"
+ "smin v21.4s, v21.4s, v31.4s\n"
+ "smin v22.4s, v22.4s, v31.4s\n"
+ "smin v23.4s, v23.4s, v31.4s\n"
+ "smin v24.4s, v24.4s, v31.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v21.8h, v22.4s\n"
+ "ld1 {v22.4s}, [x10]\n"
+ "sqxtn2 v23.8h, v24.4s\n"
+ "ld1 {v24.4s}, [x10]\n"
+ "sqxtun v21.8b, v21.8h\n"
+ "sqxtun v23.8b, v23.8h\n"
+ "uaddw v9.8h, v26.8h, v9.8b\n"
+ "st1 {v21.8b}, [%[output_ptr]], x3\n"
+ "uaddw v10.8h, v26.8h, v10.8b\n"
+ "st1 {v23.8b}, [%[output_ptr]], x3\n"
+ "uaddw v11.8h, v26.8h, v11.8b\n"
+ "uaddw v12.8h, v26.8h, v12.8b\n"
+ "uaddw v13.8h, v26.8h, v13.8b\n"
+ "uaddw v14.8h, v26.8h, v14.8b\n"
+ "uaddw v15.8h, v26.8h, v15.8b\n"
+ "ld1 {v21.4s}, [%[bias_ptr]]\n"
+ "uaddw v16.8h, v26.8h, v16.8b\n"
+ "ld1 {v23.4s}, [%[bias_ptr]]\n"
+ "uaddw v17.8h, v26.8h, v17.8b\n"
+ "uaddw v18.8h, v26.8h, v18.8b\n"
+ "uaddw v19.8h, v26.8h, v19.8b\n"
+ "uaddw v20.8h, v26.8h, v20.8b\n"
+
+ "bge " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "b\n"
+
+ // At this point, there will be one of 2 width or 1 width leftover,
+ // not both.
+ "cmp w5, #2\n"
+ "blt " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n"
+
+ // Handle last two horizontal outputs if exists.
+ DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER ":\n"
+ "smlal v21.4s, v0.4h, v9.4h\n"
+ "ld1 {v12.8b}, [x12], %[input_depth]\n"
+ "smlal2 v22.4s, v0.8h, v9.8h\n"
+ "ld1 {v16.8b}, [x13], %[input_depth]\n"
+ "smlal v23.4s, v0.4h, v10.4h\n"
+ "ld1 {v20.8b}, [x14], %[input_depth]\n"
+ "smlal2 v24.4s, v0.8h, v10.8h\n"
+ "smlal v21.4s, v1.4h, v10.4h\n"
+ "smlal2 v22.4s, v1.8h, v10.8h\n"
+ "smlal v23.4s, v1.4h, v11.4h\n"
+ "smlal2 v24.4s, v1.8h, v11.8h\n"
+ "smlal v21.4s, v2.4h, v11.4h\n"
+ "uaddw v12.8h, v26.8h, v12.8b\n"
+ "smlal2 v22.4s, v2.8h, v11.8h\n"
+ "smlal v23.4s, v2.4h, v12.4h\n"
+ "smlal2 v24.4s, v2.8h, v12.8h\n"
+ "smlal v21.4s, v3.4h, v13.4h\n"
+ "smlal2 v22.4s, v3.8h, v13.8h\n"
+ "smlal v23.4s, v3.4h, v14.4h\n"
+ "smlal2 v24.4s, v3.8h, v14.8h\n"
+ "smlal v21.4s, v4.4h, v14.4h\n"
+ "smlal2 v22.4s, v4.8h, v14.8h\n"
+ "smlal v23.4s, v4.4h, v15.4h\n"
+ "smlal2 v24.4s, v4.8h, v15.8h\n"
+ "smlal v21.4s, v5.4h, v15.4h\n"
+ "uaddw v16.8h, v26.8h, v16.8b\n"
+ "smlal2 v22.4s, v5.8h, v15.8h\n"
+ "smlal v23.4s, v5.4h, v16.4h\n"
+ "smlal2 v24.4s, v5.8h, v16.8h\n"
+ "smlal v21.4s, v6.4h, v17.4h\n"
+ "smlal2 v22.4s, v6.8h, v17.8h\n"
+ "smlal v23.4s, v6.4h, v18.4h\n"
+ "smlal2 v24.4s, v6.8h, v18.8h\n"
+ "smlal v21.4s, v7.4h, v18.4h\n"
+ "smlal2 v22.4s, v7.8h, v18.8h\n"
+ "smlal v23.4s, v7.4h, v19.4h\n"
+ "smlal2 v24.4s, v7.8h, v19.8h\n"
+ "smlal v21.4s, v8.4h, v19.4h\n"
+ "uaddw v20.8h, v26.8h, v20.8b\n"
+ "smlal2 v22.4s, v8.8h, v19.8h\n"
+ "smlal v23.4s, v8.4h, v20.4h\n"
+ "smlal2 v24.4s, v8.8h, v20.8h\n"
+
+ "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v27.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v27.4s\n"
+ "and v25.16b, v21.16b, v28.16b\n"
+ "and v29.16b, v22.16b, v28.16b\n"
+ "and v30.16b, v23.16b, v28.16b\n"
+ "and v31.16b, v24.16b, v28.16b\n"
+ "sshr v25.4s, v25.4s, #31\n"
+ "sshr v29.4s, v29.4s, #31\n"
+ "sshr v30.4s, v30.4s, #31\n"
+ "sshr v31.4s, v31.4s, #31\n"
+ "sqadd v21.4s, v21.4s, v25.4s\n"
+ "sqadd v22.4s, v22.4s, v29.4s\n"
+ "dup v29.4s, w2\n"
+ "sqadd v23.4s, v23.4s, v30.4s\n"
+ "dup v30.4s, w4\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "dup v31.4s, w0\n"
+ "srshl v21.4s, v21.4s, v28.4s\n"
+ "srshl v22.4s, v22.4s, v28.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "add v23.4s, v23.4s, v29.4s\n"
+ "add v24.4s, v24.4s, v29.4s\n"
+ "smax v21.4s, v21.4s, v30.4s\n"
+ "smax v22.4s, v22.4s, v30.4s\n"
+ "smax v23.4s, v23.4s, v30.4s\n"
+ "smax v24.4s, v24.4s, v30.4s\n"
+ "smin v21.4s, v21.4s, v31.4s\n"
+ "smin v22.4s, v22.4s, v31.4s\n"
+ "smin v23.4s, v23.4s, v31.4s\n"
+ "smin v24.4s, v24.4s, v31.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v21.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v24.4s\n"
+ "sqxtun v21.8b, v21.8h\n"
+ "sqxtun v23.8b, v23.8h\n"
+ "st1 {v21.8b}, [%[output_ptr]], x3\n"
+ "st1 {v23.8b}, [%[output_ptr]], x3\n"
+ "b " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
+
+ // Handle bottom right output if exists.
+ DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER ":\n"
+ "smlal v21.4s, v0.4h, v9.4h\n"
+ "smlal2 v22.4s, v0.8h, v9.8h\n"
+ "smlal v21.4s, v1.4h, v10.4h\n"
+ "smlal2 v22.4s, v1.8h, v10.8h\n"
+ "smlal v21.4s, v2.4h, v11.4h\n"
+ "smlal2 v22.4s, v2.8h, v11.8h\n"
+ "smlal v21.4s, v3.4h, v13.4h\n"
+ "smlal2 v22.4s, v3.8h, v13.8h\n"
+ "smlal v21.4s, v4.4h, v14.4h\n"
+ "smlal2 v22.4s, v4.8h, v14.8h\n"
+ "smlal v21.4s, v5.4h, v15.4h\n"
+ "smlal2 v22.4s, v5.8h, v15.8h\n"
+ "smlal v21.4s, v6.4h, v17.4h\n"
+ "smlal2 v22.4s, v6.8h, v17.8h\n"
+ "smlal v21.4s, v7.4h, v18.4h\n"
+ "smlal2 v22.4s, v7.8h, v18.8h\n"
+ "smlal v21.4s, v8.4h, v19.4h\n"
+ "smlal2 v22.4s, v8.8h, v19.8h\n"
+
+ "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+ "and v9.16b, v21.16b, v28.16b\n"
+ "and v12.16b, v22.16b, v28.16b\n"
+ "sshr v9.4s, v9.4s, #31\n"
+ "sshr v12.4s, v12.4s, #31\n"
+ "sqadd v21.4s, v21.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v12.4s\n"
+ "srshl v21.4s, v21.4s, v28.4s\n"
+ "srshl v22.4s, v22.4s, v28.4s\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "smax v21.4s, v21.4s, v30.4s\n"
+ "smax v22.4s, v22.4s, v30.4s\n"
+ "smin v21.4s, v21.4s, v31.4s\n"
+ "smin v22.4s, v22.4s, v31.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "sqxtn2 v21.8h, v22.4s\n"
+ "sqxtun v21.8b, v21.8h\n"
+ "st1 {v21.8b}, [%[output_ptr]]\n"
+ DEPTHWISECONV_LABEL_HEIGHT_1_END ":\n"
+ :
+ // Outputs.
+ [filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
+ [output_ptr] "+r"(output_ptr),
+ [output_window_height] "+r"(output_window_height)
+ :
+ // Inputs.
+ [bias_ptr] "r"(bias_ptr), [input_row_size] "r"(input_row_size),
+ [input_depth] "r"(input_depth),
+ [output_window_width] "r"(output_window_width),
+ [input_width_increment] "r"(input_width_increment),
+ [input_height_increment] "r"(input_height_increment),
+ [output_height_increment] "r"(output_height_increment),
+ [params_ptr] "r"(params_ptr)
+ :
+ // Clobbers.
+ "cc", "memory",
+ // We use these NEON registers.
+ "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+ "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+ "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+ "v30", "v31",
+ // We use these general-purpose registers.
+ "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+ "x9", "x10", "x11", "x12", "x13", "x14", "x15");
+#undef DEPTHWISECONV_LABEL_HEIGHT_2_LOOP
+#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP
+#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER
+#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER
+#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP
+#undef DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP
+#undef DEPTHWISECONV_LABEL_HEIGHT_1
+#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP
+#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER
+#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER
+#undef DEPTHWISECONV_LABEL_HEIGHT_1_END
}
};
template <>
-struct ConvKernel3x3FilterDepth8<1, 2, 2, 2> {
- static inline void Run(const uint8* input_ptr, int input_depth,
- int32 input_offset, int input_row_size,
- const uint8* filter_ptr, int32 filter_offset,
- const int32* bias_ptr, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_ptr,
- int output_depth, int output_width) {
- Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
-
- const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
- int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8;
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7,
- temp_8;
-
- const uint8* ptr = input_ptr;
-
- // Load all inputs for top output.
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- ptr += input_row_size;
- temp_3 = vld1_u8(ptr);
- temp_4 = vld1_u8(ptr + input_depth);
- temp_5 = vld1_u8(ptr + 2 * input_depth);
- ptr += input_row_size;
- temp_6 = vld1_u8(ptr);
- temp_7 = vld1_u8(ptr + input_depth);
- temp_8 = vld1_u8(ptr + 2 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
-
- DotProductAndStore(
- filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, bias_ptr, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max, output_ptr);
-
- // Second output.
- output_ptr += output_depth;
-
- ptr = input_ptr + 3 * input_depth;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- ptr += input_row_size;
- temp_3 = vld1_u8(ptr);
- temp_4 = vld1_u8(ptr + input_depth);
- ptr += input_row_size;
- temp_6 = vld1_u8(ptr);
- temp_7 = vld1_u8(ptr + input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
-
- DotProductAndStore(
- filter, input_2, input_0, input_1, input_5, input_3, input_4, input_8,
- input_6, input_7, bias_ptr, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max, output_ptr);
+struct DepthwiseConvWindow<8, 2, 2> {
+ static void Run(const uint8* input_ptr, const uint8* filter_ptr,
+ const int32* bias_ptr, uint8* output_ptr, int64_t input_depth,
+ int64_t input_row_size, int32 output_window_height,
+ int32 output_window_width,
+ const DepthwiseConvParams* params_ptr) {
+ const int64_t input_width_increment = 4 * input_depth;
+ const int64_t input_height_increment = 4 * input_row_size;
+ const int64_t output_height_increment = 2 * params_ptr->output_row_size;
+
+#define DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "1"
+#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "2"
+#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "3"
+#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "4"
+#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "5"
+#define DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "6"
+#define DEPTHWISECONV_LABEL_HEIGHT_1 "7"
+#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "8"
+#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "9"
+#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER "10"
+#define DEPTHWISECONV_LABEL_HEIGHT_1_END "11"
+
+ asm volatile(
+ // Performs depthwise convolutions for a window specified by
+ // |output_window_height| and |output_window_width|. The inner-most loop
+ // processes 2x2 outputs, and any leftovers at the end.
+ //
+ // Algorithm works as follows:
+ //
+ // 1. Load filters of 8 depth (8x3x3). Registers v0--v8 hold filter
+ // values.
+ // 2. For 2 output heights at a time:
+ // i. For 2 output widths at a time at stride 2, a 5x5 input
+ // window is required. To avoid register exhaustion, we load
+ // the first 2 rows of the 5x5 input window into registers
+ // v9--v18, and use the same registers to load the next 2
+ // rows, and finally v9--v13 to load the last row.
+ // Accumulators for all 2x2 outputs are reserved by registers
+ // v21-v22 (top left output), v23-v24 (top right output),
+ // v19-v20 (bottom left output), v25-v26 (bottom right
+ // output).
+ // ii. Handle single leftover width if exists.
+ // 3. Handle single leftover height if exists.
+ // i. For 2 output widths at a time at stride 2, load inputs for
+ // a 1x2 (1 height, 2 width) output window (3x5 input
+ //         window). Registers v9--v20 hold input values. Mul-add with
+ //         accumulators v21--v24.
+ // ii. Handle single leftover width if exists.
+ //
+ // Loads are placed as soon as the register is no longer needed and
+ // interleaved with arithmetic operations to take advantage of
+ // dual-issue pipelines. We also add input offsets as far from the loads
+ // as possible to give loads enough cycles to fetch data from memory.
+
+ // Set "constant" registers. These registers may be replaced with temp
+ // values from time to time when there are not enough NEON registers.
+ // We use x9--x15 general purpose registers as they are caller-saved
+ // temporary registers (see http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf). // NOLINT
+ "ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_SHIFT) "]\n"
+ "ldr w0, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
+ "cmp %w[output_window_height], #2\n"
+ "dup v28.8h, w0\n"
+ "neg w9, w9\n"
+ "ldr w1, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
+ "dup v26.4s, w9\n"
+ "ldr w2, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
+ "dup v27.4s, w1\n"
+ "ldr w3, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
+ "dup v29.4s, w2\n"
+ "ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
+ "dup v30.4s, w3\n"
+ "ldr x5, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
+ "dup v31.4s, w4\n"
+ "ldr x19, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n"
+ "ldr w20, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
+
+ // Load filters and add offsets.
+ "add x10, %[bias_ptr], #16\n"
+ "ld1 {v0.8b}, [%[filter_ptr]], x5\n"
+ "dup v9.8h, w20\n"
+ "ld1 {v1.8b}, [%[filter_ptr]], x5\n"
+ "uaddw v0.8h, v9.8h, v0.8b\n"
+ "ld1 {v2.8b}, [%[filter_ptr]], x5\n"
+ "uaddw v1.8h, v9.8h, v1.8b\n"
+ "ld1 {v3.8b}, [%[filter_ptr]], x5\n"
+ "uaddw v2.8h, v9.8h, v2.8b\n"
+ "ld1 {v4.8b}, [%[filter_ptr]], x5\n"
+ "uaddw v3.8h, v9.8h, v3.8b\n"
+ "ld1 {v5.8b}, [%[filter_ptr]], x5\n"
+ "uaddw v4.8h, v9.8h, v4.8b\n"
+ "ld1 {v6.8b}, [%[filter_ptr]], x5\n"
+ "uaddw v5.8h, v9.8h, v5.8b\n"
+ "ld1 {v7.8b}, [%[filter_ptr]], x5\n"
+ "uaddw v6.8h, v9.8h, v6.8b\n"
+ "ld1 {v8.8b}, [%[filter_ptr]]\n"
+ "uaddw v7.8h, v9.8h, v7.8b\n"
+ "uaddw v8.8h, v9.8h, v8.8b\n"
+
+ "blt " DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "f\n"
+
+ //"loop_%=:\n"
+ DEPTHWISECONV_LABEL_HEIGHT_2_LOOP ":\n"
+ // Load the first two rows of the 5x5 input window, then reuse the
+ // same registers to load subsequent rows as they become available.
+ "mov x11, %[input_ptr]\n"
+ "mov x12, x11\n"
+ "add x13, x12, %[input_row_size]\n"
+ "ld1 {v9.8b}, [x12], %[input_depth]\n"
+ "mov w14, %w[output_window_width]\n"
+ "ld1 {v10.8b}, [x12], %[input_depth]\n"
+ // The height 2 / width 2 loop loads an extra 1 output horizontally in
+ // anticipation for the next iteration. Make sure
+ // |output_window_width| is large enough to handle the additional
+ // load, otherwise jump to the appropriate label to handle smaller
+ // widths.
+ "cmp w14, #2\n"
+ "ld1 {v11.8b}, [x12], %[input_depth]\n"
+ "add x15, x13, %[input_row_size]\n"
+ "ld1 {v14.8b}, [x13], %[input_depth]\n"
+ "mov x6, %[output_ptr]\n"
+ "ld1 {v15.8b}, [x13], %[input_depth]\n"
+ "add x7, %[output_ptr], x19\n"
+ "ld1 {v16.8b}, [x13], %[input_depth]\n"
+ "ld1 {v21.4s}, [%[bias_ptr]]\n"
+ "ld1 {v22.4s}, [x10]\n"
+ "ld1 {v23.4s}, [%[bias_ptr]]\n"
+ "uaddw v9.8h, v28.8h, v9.8b\n"
+ "ld1 {v24.4s}, [x10]\n"
+ "uaddw v10.8h, v28.8h, v10.8b\n"
+ "ld1 {v19.4s}, [%[bias_ptr]]\n"
+ "uaddw v11.8h, v28.8h, v11.8b\n"
+ "ld1 {v20.4s}, [x10]\n"
+ "uaddw v14.8h, v28.8h, v14.8b\n"
+ "ld1 {v25.4s}, [%[bias_ptr]]\n"
+ "uaddw v15.8h, v28.8h, v15.8b\n"
+ "ld1 {v26.4s}, [x10]\n"
+ "uaddw v16.8h, v28.8h, v16.8b\n"
+
+ "beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "f\n"
+ "cmp w14, #1\n"
+ "beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n"
+
+ //"loop_%=:\n"
+ DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP ":\n"
+ "smlal v21.4s, v0.4h, v9.4h\n"
+ "ld1 {v12.8b}, [x12], %[input_depth]\n"
+ "smlal2 v22.4s, v0.8h, v9.8h\n"
+ "ld1 {v13.8b}, [x12]\n"
+ "add x12, x15, %[input_row_size]\n"
+ "smlal v23.4s, v0.4h, v11.4h\n"
+ "ld1 {v17.8b}, [x13], %[input_depth]\n"
+ "smlal2 v24.4s, v0.8h, v11.8h\n"
+ "ld1 {v18.8b}, [x13]\n"
+ "add x13, x12, %[input_row_size]\n"
+ "smlal v21.4s, v1.4h, v10.4h\n"
+ "ld1 {v9.8b}, [x15], %[input_depth]\n"
+ "smlal2 v22.4s, v1.8h, v10.8h\n"
+ "ld1 {v10.8b}, [x15], %[input_depth]\n"
+ "smlal v21.4s, v2.4h, v11.4h\n"
+ "smlal2 v22.4s, v2.8h, v11.8h\n"
+ "ld1 {v11.8b}, [x15], %[input_depth]\n"
+ "smlal v21.4s, v3.4h, v14.4h\n"
+ "smlal2 v22.4s, v3.8h, v14.8h\n"
+ "ld1 {v14.8b}, [x12], %[input_depth]\n"
+ "smlal v23.4s, v3.4h, v16.4h\n"
+ "subs w14, w14, #2\n"
+ "smlal2 v24.4s, v3.8h, v16.8h\n"
+ "cmp w14, #3\n"
+ "smlal v21.4s, v4.4h, v15.4h\n"
+ "uaddw v12.8h, v28.8h, v12.8b\n"
+ "smlal2 v22.4s, v4.8h, v15.8h\n"
+ "ld1 {v15.8b}, [x12], %[input_depth]\n"
+ "smlal v21.4s, v5.4h, v16.4h\n"
+ "uaddw v13.8h, v28.8h, v13.8b\n"
+ "smlal2 v22.4s, v5.8h, v16.8h\n"
+ "ld1 {v16.8b}, [x12], %[input_depth]\n"
+ "smlal v23.4s, v1.4h, v12.4h\n"
+ "uaddw v17.8h, v28.8h, v17.8b\n"
+ "smlal2 v24.4s, v1.8h, v12.8h\n"
+ "ld1 {v12.8b}, [x15], %[input_depth]\n"
+ "smlal v23.4s, v2.4h, v13.4h\n"
+ "uaddw v18.8h, v28.8h, v18.8b\n"
+ "smlal2 v24.4s, v2.8h, v13.8h\n"
+ "ld1 {v13.8b}, [x15]\n"
+ "smlal v23.4s, v4.4h, v17.4h\n"
+ "uaddw v9.8h, v28.8h, v9.8b\n"
+ "smlal2 v24.4s, v4.8h, v17.8h\n"
+ "ld1 {v17.8b}, [x12], %[input_depth]\n"
+ "smlal v23.4s, v5.4h, v18.4h\n"
+ "uaddw v10.8h, v28.8h, v10.8b\n"
+ "smlal2 v24.4s, v5.8h, v18.8h\n"
+ "ld1 {v18.8b}, [x12]\n"
+
+ "smlal v21.4s, v6.4h, v9.4h\n"
+ "smlal2 v22.4s, v6.8h, v9.8h\n"
+ "smlal v19.4s, v0.4h, v9.4h\n"
+ "uaddw v11.8h, v28.8h, v11.8b\n"
+ "smlal2 v20.4s, v0.8h, v9.8h\n"
+ "ld1 {v9.8b}, [x13], %[input_depth]\n"
+ "smlal v23.4s, v6.4h, v11.4h\n"
+ "smlal2 v24.4s, v6.8h, v11.8h\n"
+ "smlal v21.4s, v7.4h, v10.4h\n"
+ "smlal2 v22.4s, v7.8h, v10.8h\n"
+ "uaddw v12.8h, v28.8h, v12.8b\n"
+ "smlal v19.4s, v1.4h, v10.4h\n"
+ "smlal2 v20.4s, v1.8h, v10.8h\n"
+ "ld1 {v10.8b}, [x13], %[input_depth]\n"
+ "smlal v23.4s, v7.4h, v12.4h\n"
+ "smlal2 v24.4s, v7.8h, v12.8h\n"
+ "smlal v25.4s, v1.4h, v12.4h\n"
+ "smlal2 v26.4s, v1.8h, v12.8h\n"
+ "smlal v21.4s, v8.4h, v11.4h\n"
+ "smlal2 v22.4s, v8.8h, v11.8h\n"
+ "add x11, x11, %[input_width_increment]\n"
+ "smlal v19.4s, v2.4h, v11.4h\n"
+ "mov x12, x11\n"
+ "smlal2 v20.4s, v2.8h, v11.8h\n"
+ "uaddw v13.8h, v28.8h, v13.8b\n"
+ "smlal v25.4s, v0.4h, v11.4h\n"
+ "smlal2 v26.4s, v0.8h, v11.8h\n"
+ "ld1 {v11.8b}, [x13], %[input_depth]\n"
+ "smlal v23.4s, v8.4h, v13.4h\n"
+ "ld1 {v12.8b}, [x13], %[input_depth]\n"
+ "smlal2 v24.4s, v8.8h, v13.8h\n"
+ "smlal v25.4s, v2.4h, v13.4h\n"
+ "smlal2 v26.4s, v2.8h, v13.8h\n"
+ "ld1 {v13.8b}, [x13]\n"
+ "add x13, x12, %[input_row_size]\n"
+ "add x15, x13, %[input_row_size]\n"
+
+ "dup v28.4s, w9\n"
+ "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v27.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v27.4s\n"
+ "and v27.16b, v21.16b, v28.16b\n"
+ "and v29.16b, v22.16b, v28.16b\n"
+ "and v30.16b, v23.16b, v28.16b\n"
+ "and v31.16b, v24.16b, v28.16b\n"
+ "sshr v27.4s, v27.4s, #31\n"
+ "sshr v29.4s, v29.4s, #31\n"
+ "sshr v30.4s, v30.4s, #31\n"
+ "sshr v31.4s, v31.4s, #31\n"
+ "sqadd v21.4s, v21.4s, v27.4s\n"
+ "dup v27.4s, w1\n"
+ "sqadd v22.4s, v22.4s, v29.4s\n"
+ "dup v29.4s, w2\n"
+ "sqadd v23.4s, v23.4s, v30.4s\n"
+ "dup v30.4s, w3\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "dup v31.4s, w4\n"
+ "srshl v21.4s, v21.4s, v28.4s\n"
+ "srshl v22.4s, v22.4s, v28.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "dup v28.8h, w0\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "add v23.4s, v23.4s, v29.4s\n"
+ "add v24.4s, v24.4s, v29.4s\n"
+ "smax v21.4s, v21.4s, v30.4s\n"
+ "smax v22.4s, v22.4s, v30.4s\n"
+ "smax v23.4s, v23.4s, v30.4s\n"
+ "smax v24.4s, v24.4s, v30.4s\n"
+ "smin v21.4s, v21.4s, v31.4s\n"
+ "smin v22.4s, v22.4s, v31.4s\n"
+ "smin v23.4s, v23.4s, v31.4s\n"
+ "smin v24.4s, v24.4s, v31.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v21.8h, v22.4s\n"
+ "ld1 {v22.4s}, [x10]\n"
+ "sqxtn2 v23.8h, v24.4s\n"
+ "ld1 {v24.4s}, [x10]\n"
+ "sqxtun v21.8b, v21.8h\n"
+ "sqxtun v23.8b, v23.8h\n"
+ "uaddw v9.8h, v28.8h, v9.8b\n"
+ "st1 {v21.8b}, [x6], x5\n"
+ "uaddw v10.8h, v28.8h, v10.8b\n"
+ "st1 {v23.8b}, [x6], x5\n"
+ "uaddw v11.8h, v28.8h, v11.8b\n"
+
+ "smlal v19.4s, v6.4h, v9.4h\n"
+ "smlal2 v20.4s, v6.8h, v9.8h\n"
+ "ld1 {v9.8b}, [x12], %[input_depth]\n"
+ "smlal v25.4s, v6.4h, v11.4h\n"
+ "smlal2 v26.4s, v6.8h, v11.8h\n"
+ "smlal v19.4s, v7.4h, v10.4h\n"
+ "uaddw v12.8h, v28.8h, v12.8b\n"
+ "smlal2 v20.4s, v7.8h, v10.8h\n"
+ "ld1 {v10.8b}, [x12], %[input_depth]\n"
+ "smlal v25.4s, v7.4h, v12.4h\n"
+ "smlal2 v26.4s, v7.8h, v12.8h\n"
+ "smlal v19.4s, v8.4h, v11.4h\n"
+ "uaddw v13.8h, v28.8h, v13.8b\n"
+ "smlal2 v20.4s, v8.8h, v11.8h\n"
+ "ld1 {v11.8b}, [x12], %[input_depth]\n"
+ "smlal v25.4s, v8.4h, v13.4h\n"
+ "uaddw v14.8h, v28.8h, v14.8b\n"
+ "smlal2 v26.4s, v8.8h, v13.8h\n"
+ "uaddw v16.8h, v28.8h, v16.8b\n"
+ "smlal v19.4s, v3.4h, v14.4h\n"
+ "uaddw v15.8h, v28.8h, v15.8b\n"
+ "smlal2 v20.4s, v3.8h, v14.8h\n"
+ "ld1 {v14.8b}, [x13], %[input_depth]\n"
+ "smlal v25.4s, v3.4h, v16.4h\n"
+ "ld1 {v21.4s}, [%[bias_ptr]]\n"
+ "smlal2 v26.4s, v3.8h, v16.8h\n"
+ "ld1 {v23.4s}, [%[bias_ptr]]\n"
+ "smlal v19.4s, v4.4h, v15.4h\n"
+ "uaddw v17.8h, v28.8h, v17.8b\n"
+ "smlal2 v20.4s, v4.8h, v15.8h\n"
+ "ld1 {v15.8b}, [x13], %[input_depth]\n"
+ "smlal v25.4s, v4.4h, v17.4h\n"
+ "smlal2 v26.4s, v4.8h, v17.8h\n"
+ "smlal v19.4s, v5.4h, v16.4h\n"
+ "uaddw v18.8h, v28.8h, v18.8b\n"
+ "smlal2 v20.4s, v5.8h, v16.8h\n"
+ "ld1 {v16.8b}, [x13], %[input_depth]\n"
+ "smlal v25.4s, v5.4h, v18.4h\n"
+ "smlal2 v26.4s, v5.8h, v18.8h\n"
+
+ "dup v28.4s, w9\n"
+ "sqrdmulh v19.4s, v19.4s, v27.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v27.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v27.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v27.4s\n"
+ "and v27.16b, v19.16b, v28.16b\n"
+ "and v29.16b, v20.16b, v28.16b\n"
+ "and v30.16b, v25.16b, v28.16b\n"
+ "and v31.16b, v26.16b, v28.16b\n"
+ "sshr v27.4s, v27.4s, #31\n"
+ "sshr v29.4s, v29.4s, #31\n"
+ "sshr v30.4s, v30.4s, #31\n"
+ "sshr v31.4s, v31.4s, #31\n"
+ "sqadd v19.4s, v19.4s, v27.4s\n"
+ "dup v27.4s, w1\n"
+ "sqadd v20.4s, v20.4s, v29.4s\n"
+ "dup v29.4s, w2\n"
+ "sqadd v25.4s, v25.4s, v30.4s\n"
+ "dup v30.4s, w3\n"
+ "sqadd v26.4s, v26.4s, v31.4s\n"
+ "dup v31.4s, w4\n"
+ "srshl v19.4s, v19.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "srshl v25.4s, v25.4s, v28.4s\n"
+ "srshl v26.4s, v26.4s, v28.4s\n"
+ "dup v28.8h, w0\n"
+ "add v19.4s, v19.4s, v29.4s\n"
+ "add v20.4s, v20.4s, v29.4s\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "smax v19.4s, v19.4s, v30.4s\n"
+ "smax v20.4s, v20.4s, v30.4s\n"
+ "smax v25.4s, v25.4s, v30.4s\n"
+ "smax v26.4s, v26.4s, v30.4s\n"
+ "smin v19.4s, v19.4s, v31.4s\n"
+ "smin v20.4s, v20.4s, v31.4s\n"
+ "smin v25.4s, v25.4s, v31.4s\n"
+ "smin v26.4s, v26.4s, v31.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn v25.4h, v25.4s\n"
+ "sqxtn2 v19.8h, v20.4s\n"
+ "ld1 {v20.4s}, [x10]\n"
+ "sqxtn2 v25.8h, v26.4s\n"
+ "ld1 {v26.4s}, [x10]\n"
+ "sqxtun v19.8b, v19.8h\n"
+ "sqxtun v25.8b, v25.8h\n"
+ "uaddw v9.8h, v28.8h, v9.8b\n"
+ "st1 {v19.8b}, [x7], x5\n"
+ "uaddw v10.8h, v28.8h, v10.8b\n"
+ "st1 {v25.8b}, [x7], x5\n"
+ "uaddw v11.8h, v28.8h, v11.8b\n"
+ "ld1 {v19.4s}, [%[bias_ptr]]\n"
+ "uaddw v14.8h, v28.8h, v14.8b\n"
+ "ld1 {v25.4s}, [%[bias_ptr]]\n"
+ "uaddw v15.8h, v28.8h, v15.8b\n"
+ "uaddw v16.8h, v28.8h, v16.8b\n"
+
+ "bge " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "b\n"
+
+ // At this point, there will be one of 2 width or 1 width leftover,
+ // not both.
+ "cmp w14, #2\n"
+ "blt " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n"
+
+ // Handle last 2 columns if exists.
+ DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER ":\n"
+ "smlal v21.4s, v0.4h, v9.4h\n"
+ "ld1 {v12.8b}, [x12], %[input_depth]\n"
+ "smlal2 v22.4s, v0.8h, v9.8h\n"
+ "ld1 {v13.8b}, [x12]\n"
+ "add x12, x15, %[input_row_size]\n"
+ "smlal v23.4s, v0.4h, v11.4h\n"
+ "ld1 {v17.8b}, [x13], %[input_depth]\n"
+ "smlal2 v24.4s, v0.8h, v11.8h\n"
+ "ld1 {v18.8b}, [x13]\n"
+ "add x13, x12, %[input_row_size]\n"
+ "smlal v21.4s, v1.4h, v10.4h\n"
+ "ld1 {v9.8b}, [x15], %[input_depth]\n"
+ "smlal2 v22.4s, v1.8h, v10.8h\n"
+ "ld1 {v10.8b}, [x15], %[input_depth]\n"
+ "smlal v21.4s, v2.4h, v11.4h\n"
+ "smlal2 v22.4s, v2.8h, v11.8h\n"
+ "ld1 {v11.8b}, [x15], %[input_depth]\n"
+ "smlal v21.4s, v3.4h, v14.4h\n"
+ "smlal2 v22.4s, v3.8h, v14.8h\n"
+ "ld1 {v14.8b}, [x12], %[input_depth]\n"
+ "smlal v23.4s, v3.4h, v16.4h\n"
+ "smlal2 v24.4s, v3.8h, v16.8h\n"
+ "smlal v21.4s, v4.4h, v15.4h\n"
+ "uaddw v12.8h, v28.8h, v12.8b\n"
+ "smlal2 v22.4s, v4.8h, v15.8h\n"
+ "ld1 {v15.8b}, [x12], %[input_depth]\n"
+ "smlal v21.4s, v5.4h, v16.4h\n"
+ "uaddw v13.8h, v28.8h, v13.8b\n"
+ "smlal2 v22.4s, v5.8h, v16.8h\n"
+ "ld1 {v16.8b}, [x12], %[input_depth]\n"
+ "smlal v23.4s, v1.4h, v12.4h\n"
+ "uaddw v17.8h, v28.8h, v17.8b\n"
+ "smlal2 v24.4s, v1.8h, v12.8h\n"
+ "ld1 {v12.8b}, [x15], %[input_depth]\n"
+ "smlal v23.4s, v2.4h, v13.4h\n"
+ "uaddw v18.8h, v28.8h, v18.8b\n"
+ "smlal2 v24.4s, v2.8h, v13.8h\n"
+ "ld1 {v13.8b}, [x15]\n"
+ "smlal v23.4s, v4.4h, v17.4h\n"
+ "uaddw v9.8h, v28.8h, v9.8b\n"
+ "smlal2 v24.4s, v4.8h, v17.8h\n"
+ "ld1 {v17.8b}, [x12], %[input_depth]\n"
+ "smlal v23.4s, v5.4h, v18.4h\n"
+ "uaddw v10.8h, v28.8h, v10.8b\n"
+ "smlal2 v24.4s, v5.8h, v18.8h\n"
+ "ld1 {v18.8b}, [x12]\n"
+
+ "smlal v21.4s, v6.4h, v9.4h\n"
+ "smlal2 v22.4s, v6.8h, v9.8h\n"
+ "smlal v19.4s, v0.4h, v9.4h\n"
+ "uaddw v11.8h, v28.8h, v11.8b\n"
+ "smlal2 v20.4s, v0.8h, v9.8h\n"
+ "ld1 {v9.8b}, [x13], %[input_depth]\n"
+ "smlal v23.4s, v6.4h, v11.4h\n"
+ "smlal2 v24.4s, v6.8h, v11.8h\n"
+ "smlal v21.4s, v7.4h, v10.4h\n"
+ "smlal2 v22.4s, v7.8h, v10.8h\n"
+ "uaddw v12.8h, v28.8h, v12.8b\n"
+ "smlal v19.4s, v1.4h, v10.4h\n"
+ "smlal2 v20.4s, v1.8h, v10.8h\n"
+ "ld1 {v10.8b}, [x13], %[input_depth]\n"
+ "smlal v23.4s, v7.4h, v12.4h\n"
+ "smlal2 v24.4s, v7.8h, v12.8h\n"
+ "smlal v25.4s, v1.4h, v12.4h\n"
+ "smlal2 v26.4s, v1.8h, v12.8h\n"
+ "smlal v21.4s, v8.4h, v11.4h\n"
+ "smlal2 v22.4s, v8.8h, v11.8h\n"
+ "smlal v19.4s, v2.4h, v11.4h\n"
+ "smlal2 v20.4s, v2.8h, v11.8h\n"
+ "uaddw v13.8h, v28.8h, v13.8b\n"
+ "smlal v25.4s, v0.4h, v11.4h\n"
+ "smlal2 v26.4s, v0.8h, v11.8h\n"
+ "ld1 {v11.8b}, [x13], %[input_depth]\n"
+ "smlal v23.4s, v8.4h, v13.4h\n"
+ "ld1 {v12.8b}, [x13], %[input_depth]\n"
+ "smlal2 v24.4s, v8.8h, v13.8h\n"
+ "smlal v25.4s, v2.4h, v13.4h\n"
+ "smlal2 v26.4s, v2.8h, v13.8h\n"
+ "ld1 {v13.8b}, [x13]\n"
+
+ "dup v28.4s, w9\n"
+ "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v27.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v27.4s\n"
+ "and v27.16b, v21.16b, v28.16b\n"
+ "and v29.16b, v22.16b, v28.16b\n"
+ "and v30.16b, v23.16b, v28.16b\n"
+ "and v31.16b, v24.16b, v28.16b\n"
+ "sshr v27.4s, v27.4s, #31\n"
+ "sshr v29.4s, v29.4s, #31\n"
+ "sshr v30.4s, v30.4s, #31\n"
+ "sshr v31.4s, v31.4s, #31\n"
+ "sqadd v21.4s, v21.4s, v27.4s\n"
+ "dup v27.4s, w1\n"
+ "sqadd v22.4s, v22.4s, v29.4s\n"
+ "dup v29.4s, w2\n"
+ "sqadd v23.4s, v23.4s, v30.4s\n"
+ "dup v30.4s, w3\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "dup v31.4s, w4\n"
+ "srshl v21.4s, v21.4s, v28.4s\n"
+ "srshl v22.4s, v22.4s, v28.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "dup v28.8h, w0\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "add v23.4s, v23.4s, v29.4s\n"
+ "add v24.4s, v24.4s, v29.4s\n"
+ "smax v21.4s, v21.4s, v30.4s\n"
+ "smax v22.4s, v22.4s, v30.4s\n"
+ "smax v23.4s, v23.4s, v30.4s\n"
+ "smax v24.4s, v24.4s, v30.4s\n"
+ "smin v21.4s, v21.4s, v31.4s\n"
+ "smin v22.4s, v22.4s, v31.4s\n"
+ "smin v23.4s, v23.4s, v31.4s\n"
+ "smin v24.4s, v24.4s, v31.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v21.8h, v22.4s\n"
+ "ld1 {v22.4s}, [x10]\n"
+ "sqxtn2 v23.8h, v24.4s\n"
+ "ld1 {v24.4s}, [x10]\n"
+ "sqxtun v21.8b, v21.8h\n"
+ "sqxtun v23.8b, v23.8h\n"
+ "uaddw v9.8h, v28.8h, v9.8b\n"
+ "st1 {v21.8b}, [x6], x5\n"
+ "uaddw v10.8h, v28.8h, v10.8b\n"
+ "st1 {v23.8b}, [x6]\n"
+ "uaddw v11.8h, v28.8h, v11.8b\n"
+
+ "smlal v19.4s, v6.4h, v9.4h\n"
+ "smlal2 v20.4s, v6.8h, v9.8h\n"
+ "smlal v25.4s, v6.4h, v11.4h\n"
+ "smlal2 v26.4s, v6.8h, v11.8h\n"
+ "smlal v19.4s, v7.4h, v10.4h\n"
+ "uaddw v12.8h, v28.8h, v12.8b\n"
+ "smlal2 v20.4s, v7.8h, v10.8h\n"
+ "smlal v25.4s, v7.4h, v12.4h\n"
+ "smlal2 v26.4s, v7.8h, v12.8h\n"
+ "smlal v19.4s, v8.4h, v11.4h\n"
+ "uaddw v13.8h, v28.8h, v13.8b\n"
+ "smlal2 v20.4s, v8.8h, v11.8h\n"
+ "smlal v25.4s, v8.4h, v13.4h\n"
+ "uaddw v14.8h, v28.8h, v14.8b\n"
+ "smlal2 v26.4s, v8.8h, v13.8h\n"
+ "uaddw v16.8h, v28.8h, v16.8b\n"
+ "smlal v19.4s, v3.4h, v14.4h\n"
+ "uaddw v15.8h, v28.8h, v15.8b\n"
+ "smlal2 v20.4s, v3.8h, v14.8h\n"
+ "smlal v25.4s, v3.4h, v16.4h\n"
+ "smlal2 v26.4s, v3.8h, v16.8h\n"
+ "smlal v19.4s, v4.4h, v15.4h\n"
+ "uaddw v17.8h, v28.8h, v17.8b\n"
+ "smlal2 v20.4s, v4.8h, v15.8h\n"
+ "smlal v25.4s, v4.4h, v17.4h\n"
+ "smlal2 v26.4s, v4.8h, v17.8h\n"
+ "smlal v19.4s, v5.4h, v16.4h\n"
+ "uaddw v18.8h, v28.8h, v18.8b\n"
+ "smlal2 v20.4s, v5.8h, v16.8h\n"
+ "smlal v25.4s, v5.4h, v18.4h\n"
+ "smlal2 v26.4s, v5.8h, v18.8h\n"
+
+ "dup v28.4s, w9\n"
+ "sqrdmulh v19.4s, v19.4s, v27.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v27.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v27.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v27.4s\n"
+ "and v27.16b, v19.16b, v28.16b\n"
+ "and v29.16b, v20.16b, v28.16b\n"
+ "and v30.16b, v25.16b, v28.16b\n"
+ "and v31.16b, v26.16b, v28.16b\n"
+ "sshr v27.4s, v27.4s, #31\n"
+ "sshr v29.4s, v29.4s, #31\n"
+ "sshr v30.4s, v30.4s, #31\n"
+ "sshr v31.4s, v31.4s, #31\n"
+ "sqadd v19.4s, v19.4s, v27.4s\n"
+ "dup v27.4s, w1\n"
+ "sqadd v20.4s, v20.4s, v29.4s\n"
+ "dup v29.4s, w2\n"
+ "sqadd v25.4s, v25.4s, v30.4s\n"
+ "dup v30.4s, w3\n"
+ "sqadd v26.4s, v26.4s, v31.4s\n"
+ "dup v31.4s, w4\n"
+ "srshl v19.4s, v19.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "srshl v25.4s, v25.4s, v28.4s\n"
+ "srshl v26.4s, v26.4s, v28.4s\n"
+ "dup v28.8h, w0\n"
+ "add v19.4s, v19.4s, v29.4s\n"
+ "add v20.4s, v20.4s, v29.4s\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "smax v19.4s, v19.4s, v30.4s\n"
+ "smax v20.4s, v20.4s, v30.4s\n"
+ "smax v25.4s, v25.4s, v30.4s\n"
+ "smax v26.4s, v26.4s, v30.4s\n"
+ "smin v19.4s, v19.4s, v31.4s\n"
+ "smin v20.4s, v20.4s, v31.4s\n"
+ "smin v25.4s, v25.4s, v31.4s\n"
+ "smin v26.4s, v26.4s, v31.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn v25.4h, v25.4s\n"
+ "sqxtn2 v19.8h, v20.4s\n"
+ "sqxtn2 v25.8h, v26.4s\n"
+ "sqxtun v19.8b, v19.8h\n"
+ "sqxtun v25.8b, v25.8h\n"
+ "st1 {v19.8b}, [x7], x5\n"
+ "st1 {v25.8b}, [x7]\n"
+ "b " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "f\n"
+
+ // Handle the last column if it exists.
+ DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER ":\n"
+ // Registers v9, v10, v11, v14, v15, and v16 have already been loaded
+ // with the correct values at this point. This corresponds to the
+ // first two input rows of the top left output. Now load the last
+ // input row for this output. Once these inputs are no longer needed,
+ // load the input rows for the bottom left output.
+ "add x12, x15, %[input_row_size]\n"
+ "add x13, x12, %[input_row_size]\n"
+
+ "ld1 {v12.8b}, [x15], %[input_depth]\n"
+ "smlal v21.4s, v0.4h, v9.4h\n"
+ "ld1 {v13.8b}, [x15], %[input_depth]\n"
+ "smlal2 v22.4s, v0.8h, v9.8h\n"
+ "ld1 {v17.8b}, [x15]\n"
+ "smlal v21.4s, v1.4h, v10.4h\n"
+ "ld1 {v9.8b}, [x12], %[input_depth]\n"
+ "smlal2 v22.4s, v1.8h, v10.8h\n"
+ "ld1 {v10.8b}, [x12], %[input_depth]\n"
+ "smlal v21.4s, v2.4h, v11.4h\n"
+ "smlal2 v22.4s, v2.8h, v11.8h\n"
+ "ld1 {v11.8b}, [x12]\n"
+ "smlal v21.4s, v3.4h, v14.4h\n"
+ "smlal2 v22.4s, v3.8h, v14.8h\n"
+ "ld1 {v14.8b}, [x13], %[input_depth]\n"
+ "smlal v21.4s, v4.4h, v15.4h\n"
+ "smlal2 v22.4s, v4.8h, v15.8h\n"
+ "ld1 {v15.8b}, [x13], %[input_depth]\n"
+ "smlal v21.4s, v5.4h, v16.4h\n"
+ "uaddw v12.8h, v28.8h, v12.8b\n"
+ "smlal2 v22.4s, v5.8h, v16.8h\n"
+ "uaddw v13.8h, v28.8h, v13.8b\n"
+ "ld1 {v16.8b}, [x13]\n"
+
+ "smlal v21.4s, v6.4h, v12.4h\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "smlal v23.4s, v0.4h, v12.4h\n"
+ "uaddw v17.8h, v28.8h, v17.8b\n"
+ "smlal2 v24.4s, v0.8h, v12.8h\n"
+ "smlal v21.4s, v7.4h, v13.4h\n"
+ "smlal2 v22.4s, v7.8h, v13.8h\n"
+ "smlal v23.4s, v1.4h, v13.4h\n"
+ "smlal2 v24.4s, v1.8h, v13.8h\n"
+ "smlal v21.4s, v8.4h, v17.4h\n"
+ "smlal2 v22.4s, v8.8h, v17.8h\n"
+ "smlal v23.4s, v2.4h, v17.4h\n"
+ "smlal2 v24.4s, v2.8h, v17.8h\n"
+
+ "dup v26.4s, w9\n"
+ "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+ "and v18.16b, v21.16b, v26.16b\n"
+ "and v19.16b, v22.16b, v26.16b\n"
+ "sshr v18.4s, v18.4s, #31\n"
+ "sshr v19.4s, v19.4s, #31\n"
+ "sqadd v21.4s, v21.4s, v18.4s\n"
+ "sqadd v22.4s, v22.4s, v19.4s\n"
+ "srshl v21.4s, v21.4s, v26.4s\n"
+ "srshl v22.4s, v22.4s, v26.4s\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v22.4s, v22.4s, v29.4s\n"
+ "smax v21.4s, v21.4s, v30.4s\n"
+ "smax v22.4s, v22.4s, v30.4s\n"
+ "smin v21.4s, v21.4s, v31.4s\n"
+ "smin v22.4s, v22.4s, v31.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "sqxtn2 v21.8h, v22.4s\n"
+ "sqxtun v21.8b, v21.8h\n"
+ "uaddw v9.8h, v28.8h, v9.8b\n"
+ "st1 {v21.8b}, [x6]\n"
+ "uaddw v10.8h, v28.8h, v10.8b\n"
+
+ "smlal v23.4s, v3.4h, v9.4h\n"
+ "uaddw v11.8h, v28.8h, v11.8b\n"
+ "smlal2 v24.4s, v3.8h, v9.8h\n"
+ "uaddw v14.8h, v28.8h, v14.8b\n"
+ "smlal v23.4s, v4.4h, v10.4h\n"
+ "uaddw v15.8h, v28.8h, v15.8b\n"
+ "smlal2 v24.4s, v4.8h, v10.8h\n"
+ "uaddw v16.8h, v28.8h, v16.8b\n"
+ "smlal v23.4s, v5.4h, v11.4h\n"
+ "smlal2 v24.4s, v5.8h, v11.8h\n"
+
+ "smlal v23.4s, v6.4h, v14.4h\n"
+ "smlal2 v24.4s, v6.8h, v14.8h\n"
+ "smlal v23.4s, v7.4h, v15.4h\n"
+ "smlal2 v24.4s, v7.8h, v15.8h\n"
+ "smlal v23.4s, v8.4h, v16.4h\n"
+ "smlal2 v24.4s, v8.8h, v16.8h\n"
+
+ "sqrdmulh v23.4s, v23.4s, v27.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v27.4s\n"
+ "and v18.16b, v23.16b, v26.16b\n"
+ "and v19.16b, v24.16b, v26.16b\n"
+ "sshr v18.4s, v18.4s, #31\n"
+ "sshr v19.4s, v19.4s, #31\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "srshl v23.4s, v23.4s, v26.4s\n"
+ "srshl v24.4s, v24.4s, v26.4s\n"
+ "add v23.4s, v23.4s, v29.4s\n"
+ "add v24.4s, v24.4s, v29.4s\n"
+ "smax v23.4s, v23.4s, v30.4s\n"
+ "smax v24.4s, v24.4s, v30.4s\n"
+ "smin v23.4s, v23.4s, v31.4s\n"
+ "smin v24.4s, v24.4s, v31.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v23.8h, v24.4s\n"
+ "sqxtun v23.8b, v23.8h\n"
+ "st1 {v23.8b}, [x7]\n"
+
+ DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP ":\n"
+ "subs %w[output_window_height], %w[output_window_height], #2\n"
+ "add %[input_ptr], %[input_ptr], %[input_height_increment]\n"
+ "cmp %w[output_window_height], #2\n"
+ "add %[output_ptr], %[output_ptr], %[output_height_increment]\n"
+ "bge " DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "b\n"
+
+ DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP ":\n"
+ "cmp %w[output_window_height], #1\n"
+ "blt " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
+
+ DEPTHWISECONV_LABEL_HEIGHT_1 ":\n"
+ "mov x11, %[input_ptr]\n"
+ "mov x12, x11\n"
+ "add x13, x12, %[input_row_size]\n"
+ "ld1 {v9.8b}, [x12], %[input_depth]\n"
+ "add x15, x13, %[input_row_size]\n"
+ "ld1 {v10.8b}, [x12], %[input_depth]\n"
+ "mov x6, %[output_ptr]\n"
+ "ld1 {v11.8b}, [x12], %[input_depth]\n"
+ "mov w14, %w[output_window_width]\n"
+ // The height 1 / width 2 loop loads an extra 1x1 output in anticipation
+ // of the next iteration. Make sure |output_window_width| is large
+ // enough to handle the additional load, otherwise jump to the
+ // appropriate label to handle smaller widths.
+ "cmp w14, #2\n"
+ "ld1 {v12.8b}, [x13], %[input_depth]\n"
+ "ld1 {v13.8b}, [x13], %[input_depth]\n"
+ "ld1 {v14.8b}, [x13], %[input_depth]\n"
+ "ld1 {v15.8b}, [x15], %[input_depth]\n"
+ "ld1 {v16.8b}, [x15], %[input_depth]\n"
+ "ld1 {v17.8b}, [x15], %[input_depth]\n"
+
+ "uaddw v9.8h, v28.8h, v9.8b\n"
+ "ld1 {v24.4s}, [%[bias_ptr]]\n"
+ "uaddw v10.8h, v28.8h, v10.8b\n"
+ "ld1 {v25.4s}, [x10]\n"
+ "uaddw v11.8h, v28.8h, v11.8b\n"
+ "ld1 {v26.4s}, [%[bias_ptr]]\n"
+ "ld1 {v27.4s}, [x10]\n"
+ "uaddw v12.8h, v28.8h, v12.8b\n"
+ "uaddw v13.8h, v28.8h, v13.8b\n"
+ "uaddw v14.8h, v28.8h, v14.8b\n"
+ "uaddw v15.8h, v28.8h, v15.8b\n"
+ "uaddw v16.8h, v28.8h, v16.8b\n"
+ "uaddw v17.8h, v28.8h, v17.8b\n"
+
+ "beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER "f\n"
+ "cmp w14, #1\n"
+ "beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n"
+
+ //"loop_%=:\n"
+ DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP ":\n"
+ "smlal v24.4s, v0.4h, v9.4h\n"
+ "ld1 {v18.8b}, [x12], %[input_depth]\n"
+ "smlal2 v25.4s, v0.8h, v9.8h\n"
+ "ld1 {v19.8b}, [x12]\n"
+ "smlal v26.4s, v0.4h, v11.4h\n"
+ "ld1 {v20.8b}, [x13], %[input_depth]\n"
+ "smlal2 v27.4s, v0.8h, v11.8h\n"
+ "ld1 {v21.8b}, [x13]\n"
+ "smlal v24.4s, v1.4h, v10.4h\n"
+ "ld1 {v22.8b}, [x15], %[input_depth]\n"
+ "smlal2 v25.4s, v1.8h, v10.8h\n"
+ "ld1 {v23.8b}, [x15]\n"
+ "smlal v24.4s, v2.4h, v11.4h\n"
+ "subs w14, w14, #2\n"
+ "smlal2 v25.4s, v2.8h, v11.8h\n"
+ "cmp w14, #3\n"
+ "smlal v24.4s, v3.4h, v12.4h\n"
+ "add x11, x11, %[input_width_increment]\n"
+ "smlal2 v25.4s, v3.8h, v12.8h\n"
+ "mov x12, x11\n"
+ "smlal v26.4s, v3.4h, v14.4h\n"
+ "add x13, x12, %[input_row_size]\n"
+ "smlal2 v27.4s, v3.8h, v14.8h\n"
+ "add x15, x13, %[input_row_size]\n"
+ "smlal v24.4s, v4.4h, v13.4h\n"
+ "ld1 {v9.8b}, [x12], %[input_depth]\n"
+ "smlal2 v25.4s, v4.8h, v13.8h\n"
+ "ld1 {v10.8b}, [x12], %[input_depth]\n"
+ "smlal v24.4s, v5.4h, v14.4h\n"
+ "ld1 {v11.8b}, [x12], %[input_depth]\n"
+ "smlal2 v25.4s, v5.8h, v14.8h\n"
+ "ld1 {v12.8b}, [x13], %[input_depth]\n"
+ "smlal v24.4s, v6.4h, v15.4h\n"
+ "ld1 {v13.8b}, [x13], %[input_depth]\n"
+ "smlal2 v25.4s, v6.8h, v15.8h\n"
+ "ld1 {v14.8b}, [x13], %[input_depth]\n"
+ "smlal v26.4s, v6.4h, v17.4h\n"
+ "ld1 {v15.8b}, [x15], %[input_depth]\n"
+ "smlal2 v27.4s, v6.8h, v17.8h\n"
+ "smlal v24.4s, v7.4h, v16.4h\n"
+ "smlal2 v25.4s, v7.8h, v16.8h\n"
+ "ld1 {v16.8b}, [x15], %[input_depth]\n"
+ "smlal v24.4s, v8.4h, v17.4h\n"
+ "uaddw v18.8h, v28.8h, v18.8b\n"
+ "smlal2 v25.4s, v8.8h, v17.8h\n"
+ "ld1 {v17.8b}, [x15], %[input_depth]\n"
+ "uaddw v19.8h, v28.8h, v19.8b\n"
+
+ "smlal v26.4s, v1.4h, v18.4h\n"
+ "uaddw v20.8h, v28.8h, v20.8b\n"
+ "smlal2 v27.4s, v1.8h, v18.8h\n"
+ "smlal v26.4s, v2.4h, v19.4h\n"
+ "uaddw v21.8h, v28.8h, v21.8b\n"
+ "smlal2 v27.4s, v2.8h, v19.8h\n"
+ "smlal v26.4s, v4.4h, v20.4h\n"
+ "smlal v26.4s, v5.4h, v21.4h\n"
+ "smlal2 v27.4s, v4.8h, v20.8h\n"
+ "uaddw v22.8h, v28.8h, v22.8b\n"
+ "smlal2 v27.4s, v5.8h, v21.8h\n"
+ "uaddw v23.8h, v28.8h, v23.8b\n"
+ "smlal v26.4s, v7.4h, v22.4h\n"
+ "smlal2 v27.4s, v7.8h, v22.8h\n"
+ "smlal v26.4s, v8.4h, v23.4h\n"
+ "smlal2 v27.4s, v8.8h, v23.8h\n"
+
+ "dup v28.4s, w1\n"
+ "dup v29.4s, w9\n"
+ "sqrdmulh v24.4s, v24.4s, v28.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v28.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v28.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v28.4s\n"
+ "dup v28.4s, w2\n"
+ "and v30.16b, v24.16b, v29.16b\n"
+ "and v31.16b, v25.16b, v29.16b\n"
+ "sshr v30.4s, v30.4s, #31\n"
+ "sshr v31.4s, v31.4s, #31\n"
+ "sqadd v24.4s, v24.4s, v30.4s\n"
+ "sqadd v25.4s, v25.4s, v31.4s\n"
+ "and v30.16b, v26.16b, v29.16b\n"
+ "and v31.16b, v27.16b, v29.16b\n"
+ "sshr v30.4s, v30.4s, #31\n"
+ "sshr v31.4s, v31.4s, #31\n"
+ "sqadd v26.4s, v26.4s, v30.4s\n"
+ "dup v30.4s, w3\n"
+ "sqadd v27.4s, v27.4s, v31.4s\n"
+ "dup v31.4s, w4\n"
+ "srshl v24.4s, v24.4s, v29.4s\n"
+ "srshl v25.4s, v25.4s, v29.4s\n"
+ "srshl v26.4s, v26.4s, v29.4s\n"
+ "srshl v27.4s, v27.4s, v29.4s\n"
+ "add v24.4s, v24.4s, v28.4s\n"
+ "add v25.4s, v25.4s, v28.4s\n"
+ "add v26.4s, v26.4s, v28.4s\n"
+ "add v27.4s, v27.4s, v28.4s\n"
+ "dup v28.8h, w0\n"
+ "smax v24.4s, v24.4s, v30.4s\n"
+ "smax v25.4s, v25.4s, v30.4s\n"
+ "smax v26.4s, v26.4s, v30.4s\n"
+ "smax v27.4s, v27.4s, v30.4s\n"
+ "smin v24.4s, v24.4s, v31.4s\n"
+ "smin v25.4s, v25.4s, v31.4s\n"
+ "smin v26.4s, v26.4s, v31.4s\n"
+ "smin v27.4s, v27.4s, v31.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "sqxtn v26.4h, v26.4s\n"
+ "sqxtn2 v24.8h, v25.4s\n"
+ "ld1 {v25.4s}, [x10]\n"
+ "sqxtn2 v26.8h, v27.4s\n"
+ "ld1 {v27.4s}, [x10]\n"
+ "sqxtun v24.8b, v24.8h\n"
+ "sqxtun v26.8b, v26.8h\n"
+ "uaddw v9.8h, v28.8h, v9.8b\n"
+ "st1 {v24.8b}, [x6], x5\n"
+ "uaddw v10.8h, v28.8h, v10.8b\n"
+ "st1 {v26.8b}, [x6], x5\n"
+ "uaddw v11.8h, v28.8h, v11.8b\n"
+ "uaddw v12.8h, v28.8h, v12.8b\n"
+ "uaddw v13.8h, v28.8h, v13.8b\n"
+ "uaddw v14.8h, v28.8h, v14.8b\n"
+ "ld1 {v24.4s}, [%[bias_ptr]]\n"
+ "uaddw v15.8h, v28.8h, v15.8b\n"
+ "ld1 {v26.4s}, [%[bias_ptr]]\n"
+ "uaddw v16.8h, v28.8h, v16.8b\n"
+ "uaddw v17.8h, v28.8h, v17.8b\n"
+
+ "bge " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "b\n"
+
+ // At this point there is either a width-2 or a width-1 leftover
+ // column, but never both.
+ "cmp w14, #2\n"
+ "blt " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n"
+
+ // Handle the last two horizontal outputs if they exist.
+ DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER ":\n"
+ "smlal v24.4s, v0.4h, v9.4h\n"
+ "ld1 {v18.8b}, [x12], %[input_depth]\n"
+ "smlal2 v25.4s, v0.8h, v9.8h\n"
+ "ld1 {v19.8b}, [x12]\n"
+ "smlal v26.4s, v0.4h, v11.4h\n"
+ "ld1 {v20.8b}, [x13], %[input_depth]\n"
+ "smlal2 v27.4s, v0.8h, v11.8h\n"
+ "ld1 {v21.8b}, [x13]\n"
+ "smlal v24.4s, v1.4h, v10.4h\n"
+ "ld1 {v22.8b}, [x15], %[input_depth]\n"
+ "smlal2 v25.4s, v1.8h, v10.8h\n"
+ "ld1 {v23.8b}, [x15]\n"
+ "smlal v24.4s, v2.4h, v11.4h\n"
+ "smlal2 v25.4s, v2.8h, v11.8h\n"
+ "smlal v24.4s, v3.4h, v12.4h\n"
+ "smlal2 v25.4s, v3.8h, v12.8h\n"
+ "smlal v26.4s, v3.4h, v14.4h\n"
+ "smlal2 v27.4s, v3.8h, v14.8h\n"
+ "smlal v24.4s, v4.4h, v13.4h\n"
+ "smlal2 v25.4s, v4.8h, v13.8h\n"
+ "smlal v24.4s, v5.4h, v14.4h\n"
+ "smlal2 v25.4s, v5.8h, v14.8h\n"
+ "smlal v24.4s, v6.4h, v15.4h\n"
+ "smlal2 v25.4s, v6.8h, v15.8h\n"
+ "smlal v26.4s, v6.4h, v17.4h\n"
+ "smlal2 v27.4s, v6.8h, v17.8h\n"
+ "smlal v24.4s, v7.4h, v16.4h\n"
+ "smlal2 v25.4s, v7.8h, v16.8h\n"
+ "smlal v24.4s, v8.4h, v17.4h\n"
+ "uaddw v18.8h, v28.8h, v18.8b\n"
+ "smlal2 v25.4s, v8.8h, v17.8h\n"
+ "uaddw v19.8h, v28.8h, v19.8b\n"
+
+ "smlal v26.4s, v1.4h, v18.4h\n"
+ "uaddw v20.8h, v28.8h, v20.8b\n"
+ "smlal2 v27.4s, v1.8h, v18.8h\n"
+ "smlal v26.4s, v2.4h, v19.4h\n"
+ "uaddw v21.8h, v28.8h, v21.8b\n"
+ "smlal2 v27.4s, v2.8h, v19.8h\n"
+ "smlal v26.4s, v4.4h, v20.4h\n"
+ "smlal v26.4s, v5.4h, v21.4h\n"
+ "smlal2 v27.4s, v4.8h, v20.8h\n"
+ "uaddw v22.8h, v28.8h, v22.8b\n"
+ "smlal2 v27.4s, v5.8h, v21.8h\n"
+ "uaddw v23.8h, v28.8h, v23.8b\n"
+ "smlal v26.4s, v7.4h, v22.4h\n"
+ "smlal2 v27.4s, v7.8h, v22.8h\n"
+ "smlal v26.4s, v8.4h, v23.4h\n"
+ "smlal2 v27.4s, v8.8h, v23.8h\n"
+
+ "dup v28.4s, w1\n"
+ "dup v29.4s, w9\n"
+ "sqrdmulh v24.4s, v24.4s, v28.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v28.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v28.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v28.4s\n"
+ "dup v28.4s, w2\n"
+ "and v30.16b, v24.16b, v29.16b\n"
+ "and v31.16b, v25.16b, v29.16b\n"
+ "sshr v30.4s, v30.4s, #31\n"
+ "sshr v31.4s, v31.4s, #31\n"
+ "sqadd v24.4s, v24.4s, v30.4s\n"
+ "sqadd v25.4s, v25.4s, v31.4s\n"
+ "and v30.16b, v26.16b, v29.16b\n"
+ "and v31.16b, v27.16b, v29.16b\n"
+ "sshr v30.4s, v30.4s, #31\n"
+ "sshr v31.4s, v31.4s, #31\n"
+ "sqadd v26.4s, v26.4s, v30.4s\n"
+ "dup v30.4s, w3\n"
+ "sqadd v27.4s, v27.4s, v31.4s\n"
+ "dup v31.4s, w4\n"
+ "srshl v24.4s, v24.4s, v29.4s\n"
+ "srshl v25.4s, v25.4s, v29.4s\n"
+ "srshl v26.4s, v26.4s, v29.4s\n"
+ "srshl v27.4s, v27.4s, v29.4s\n"
+ "add v24.4s, v24.4s, v28.4s\n"
+ "add v25.4s, v25.4s, v28.4s\n"
+ "add v26.4s, v26.4s, v28.4s\n"
+ "add v27.4s, v27.4s, v28.4s\n"
+ "dup v28.8h, w0\n"
+ "smax v24.4s, v24.4s, v30.4s\n"
+ "smax v25.4s, v25.4s, v30.4s\n"
+ "smax v26.4s, v26.4s, v30.4s\n"
+ "smax v27.4s, v27.4s, v30.4s\n"
+ "smin v24.4s, v24.4s, v31.4s\n"
+ "smin v25.4s, v25.4s, v31.4s\n"
+ "smin v26.4s, v26.4s, v31.4s\n"
+ "smin v27.4s, v27.4s, v31.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "sqxtn v26.4h, v26.4s\n"
+ "sqxtn2 v24.8h, v25.4s\n"
+ "sqxtn2 v26.8h, v27.4s\n"
+ "sqxtun v24.8b, v24.8h\n"
+ "sqxtun v26.8b, v26.8h\n"
+ "st1 {v24.8b}, [x6], x5\n"
+ "st1 {v26.8b}, [x6]\n"
+ "b " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
+
+ // Handle the bottom-right output if it exists.
+ DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER ":\n"
+ "dup v26.4s, w9\n"
+ "dup v27.4s, w1\n"
+ "dup v29.4s, w2\n"
+
+ "smlal v24.4s, v0.4h, v9.4h\n"
+ "smlal2 v25.4s, v0.8h, v9.8h\n"
+ "smlal v24.4s, v1.4h, v10.4h\n"
+ "smlal2 v25.4s, v1.8h, v10.8h\n"
+ "smlal v24.4s, v2.4h, v11.4h\n"
+ "smlal2 v25.4s, v2.8h, v11.8h\n"
+ "smlal v24.4s, v3.4h, v12.4h\n"
+ "smlal2 v25.4s, v3.8h, v12.8h\n"
+ "smlal v24.4s, v4.4h, v13.4h\n"
+ "smlal2 v25.4s, v4.8h, v13.8h\n"
+ "smlal v24.4s, v5.4h, v14.4h\n"
+ "smlal2 v25.4s, v5.8h, v14.8h\n"
+ "smlal v24.4s, v6.4h, v15.4h\n"
+ "smlal2 v25.4s, v6.8h, v15.8h\n"
+ "smlal v24.4s, v7.4h, v16.4h\n"
+ "smlal2 v25.4s, v7.8h, v16.8h\n"
+ "smlal v24.4s, v8.4h, v17.4h\n"
+ "smlal2 v25.4s, v8.8h, v17.8h\n"
+
+ "sqrdmulh v24.4s, v24.4s, v27.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v27.4s\n"
+ "and v18.16b, v24.16b, v26.16b\n"
+ "and v19.16b, v25.16b, v26.16b\n"
+ "sshr v18.4s, v18.4s, #31\n"
+ "sshr v19.4s, v19.4s, #31\n"
+ "sqadd v24.4s, v24.4s, v18.4s\n"
+ "sqadd v25.4s, v25.4s, v19.4s\n"
+ "srshl v24.4s, v24.4s, v26.4s\n"
+ "srshl v25.4s, v25.4s, v26.4s\n"
+ "add v24.4s, v24.4s, v29.4s\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "smax v24.4s, v24.4s, v30.4s\n"
+ "smax v25.4s, v25.4s, v30.4s\n"
+ "smin v24.4s, v24.4s, v31.4s\n"
+ "smin v25.4s, v25.4s, v31.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "sqxtn2 v24.8h, v25.4s\n"
+ "sqxtun v24.8b, v24.8h\n"
+ "st1 {v24.8b}, [x6]\n"
+
+ DEPTHWISECONV_LABEL_HEIGHT_1_END ":\n"
+ :
+ // Outputs.
+ [filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
+ [output_ptr] "+r"(output_ptr),
+ [output_window_height] "+r"(output_window_height)
+ :
+ // Inputs.
+ [bias_ptr] "r"(bias_ptr), [input_row_size] "r"(input_row_size),
+ [input_depth] "r"(input_depth),
+ [output_window_width] "r"(output_window_width),
+ [input_width_increment] "r"(input_width_increment),
+ [input_height_increment] "r"(input_height_increment),
+ [output_height_increment] "r"(output_height_increment),
+ [params_ptr] "r"(params_ptr)
+ :
+ // Clobbers.
+ "cc", "memory",
+ // We use these NEON registers.
+ "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+ "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+ "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+ "v30", "v31",
+ // We use these general-purpose registers.
+ "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+ "x9", "x10", "x11", "x12", "x13", "x14", "x15",
+ "x19", "x20");
+#undef DEPTHWISECONV_LABEL_HEIGHT_2_LOOP
+#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP
+#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER
+#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER
+#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP
+#undef DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP
+#undef DEPTHWISECONV_LABEL_HEIGHT_1
+#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP
+#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER
+#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER
+#undef DEPTHWISECONV_LABEL_HEIGHT_1_END
}
};
-template <>
-struct ConvKernel3x3FilterDepth8<1, 4, 2, 2> {
- static inline void Run(const uint8* input_ptr, int input_depth,
- int32 input_offset, int input_row_size,
- const uint8* filter_ptr, int32 filter_offset,
- const int32* bias_ptr, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_ptr,
- int output_depth, int output_width) {
- Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
-
- const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
- int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8;
- uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7,
- temp_8;
-
+#undef OFFSET_INPUT_DEPTH
+#undef OFFSET_INPUT_ROW_SIZE
+#undef OFFSET_OUTPUT_DEPTH
+#undef OFFSET_OUTPUT_ROW_SIZE
+#undef OFFSET_INPUT_OFFSET
+#undef OFFSET_OUTPUT_OFFSET
+#undef OFFSET_FILTER_OFFSET
+#undef OFFSET_OUTPUT_MULTIPLIER
+#undef OFFSET_OUTPUT_ACTIVATION_MIN
+#undef OFFSET_OUTPUT_ACTIVATION_MAX
+#undef OFFSET_OUTPUT_SHIFT
+#undef OFFSET_INPUT_WIDTH
+#undef OFFSET_INPUT_HEIGHT
+#undef OFFSET_OUTPUT_WIDTH
+#undef OFFSET_OUTPUT_HEIGHT
+#undef STR
+#undef STR_UNEXPANDED
+
+// Copies a subset of the input designated by |input_ptr| into |output_ptr|
+// with the specified output dimensions. Supports output depths of 64 only as
+// this is the cache line size.
+inline void ShuffleInput(const uint8* input_ptr, int64_t input_depth,
+ int32 input_width, int32 input_height,
+ int64_t output_depth, int32 output_width,
+ int32 output_height, uint8* output_ptr) {
+ const int64_t input_row_size = input_depth * input_width;
+ for (int32 y = 0; y < output_height; y++) {
const uint8* ptr = input_ptr;
-
- // Load all inputs for top output.
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- temp_2 = vld1_u8(ptr + 2 * input_depth);
- ptr += input_row_size;
- temp_3 = vld1_u8(ptr);
- temp_4 = vld1_u8(ptr + input_depth);
- temp_5 = vld1_u8(ptr + 2 * input_depth);
- ptr += input_row_size;
- temp_6 = vld1_u8(ptr);
- temp_7 = vld1_u8(ptr + input_depth);
- temp_8 = vld1_u8(ptr + 2 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
-
- DotProductAndStore(
- filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, bias_ptr, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max, output_ptr);
-
- // Second output.
- output_ptr += output_depth;
-
- ptr = input_ptr + 3 * input_depth;
- temp_0 = vld1_u8(ptr);
- temp_1 = vld1_u8(ptr + input_depth);
- ptr += input_row_size;
- temp_3 = vld1_u8(ptr);
- temp_4 = vld1_u8(ptr + input_depth);
- ptr += input_row_size;
- temp_6 = vld1_u8(ptr);
- temp_7 = vld1_u8(ptr + input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
-
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
-
- DotProductAndStore(
- filter, input_2, input_0, input_1, input_5, input_3, input_4, input_8,
- input_6, input_7, bias_ptr, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max, output_ptr);
-
- // Third output.
- output_ptr += output_depth;
-
- ptr = input_ptr + 5 * input_depth;
- temp_2 = vld1_u8(ptr);
- temp_0 = vld1_u8(ptr + input_depth);
- ptr += input_row_size;
- temp_5 = vld1_u8(ptr);
- temp_3 = vld1_u8(ptr + input_depth);
- ptr += input_row_size;
- temp_8 = vld1_u8(ptr);
- temp_6 = vld1_u8(ptr + input_depth);
-
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
-
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
-
- DotProductAndStore(
- filter, input_1, input_2, input_0, input_4, input_5, input_3, input_7,
- input_8, input_6, bias_ptr, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max, output_ptr);
-
- // Fourth output.
- output_ptr += output_depth;
-
- ptr = input_ptr + 7 * input_depth;
- temp_1 = vld1_u8(ptr);
- temp_2 = vld1_u8(ptr + input_depth);
- ptr += input_row_size;
- temp_4 = vld1_u8(ptr);
- temp_5 = vld1_u8(ptr + input_depth);
- ptr += input_row_size;
- temp_7 = vld1_u8(ptr);
- temp_8 = vld1_u8(ptr + input_depth);
-
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
-
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
-
- DotProductAndStore(
- filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, bias_ptr, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max, output_ptr);
- }
-};
-
-template <int kFixedStrideWidth, int kFixedStrideHeight>
-struct ConvKernel3x3FilterDepth8<1, 1, kFixedStrideWidth, kFixedStrideHeight> {
- static inline void Run(const uint8* input_ptr, int input_depth,
- int32 input_offset, int input_row_size,
- const uint8* filter_ptr, int32 filter_offset,
- const int32* bias_ptr, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_ptr,
- int output_depth, int output_width) {
- Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
-
- int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8;
-
- uint8x8_t temp_0 = vld1_u8(input_ptr);
- uint8x8_t temp_1 = vld1_u8(input_ptr + input_depth);
- uint8x8_t temp_2 = vld1_u8(input_ptr + 2 * input_depth);
-
- input_ptr += input_row_size;
- uint8x8_t temp_3 = vld1_u8(input_ptr);
- uint8x8_t temp_4 = vld1_u8(input_ptr + input_depth);
- uint8x8_t temp_5 = vld1_u8(input_ptr + 2 * input_depth);
-
- input_ptr += input_row_size;
- uint8x8_t temp_6 = vld1_u8(input_ptr);
- uint8x8_t temp_7 = vld1_u8(input_ptr + input_depth);
- uint8x8_t temp_8 = vld1_u8(input_ptr + 2 * input_depth);
-
- input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
- input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
- input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
- input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
- input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
- input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
- input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
- input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
- input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
-
- const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
- input_0 = vaddq_s16(input_0, input_offset_vec);
- input_1 = vaddq_s16(input_1, input_offset_vec);
- input_2 = vaddq_s16(input_2, input_offset_vec);
- input_3 = vaddq_s16(input_3, input_offset_vec);
- input_4 = vaddq_s16(input_4, input_offset_vec);
- input_5 = vaddq_s16(input_5, input_offset_vec);
- input_6 = vaddq_s16(input_6, input_offset_vec);
- input_7 = vaddq_s16(input_7, input_offset_vec);
- input_8 = vaddq_s16(input_8, input_offset_vec);
-
- DotProductAndStore(
- filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
- input_7, input_8, bias_ptr, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max, output_ptr);
- }
-};
-
-inline void ShuffleInput(const uint8* input_ptr, int input_depth,
- int input_width, int input_height, int output_depth,
- int output_width, int output_height,
- uint8* output_ptr) {
- const int input_row_size = input_depth * input_width;
-
- for (int y = 0; y < output_height; y++) {
- const uint8* ptr = input_ptr;
- for (int x = 0; x < output_width; x++) {
+ for (int32 x = 0; x < output_width; x++) {
memcpy(output_ptr, ptr, output_depth);
output_ptr += output_depth;
ptr += input_depth;
}
}
-template <int kFixedHeight, int kFixedStrideWidth, int kFixedStrideHeight>
-struct ConvRow3x3FilterDepth8 {};
-
-template <int kFixedStrideWidth, int kFixedStrideHeight>
-struct ConvRow3x3FilterDepth8<1, kFixedStrideWidth, kFixedStrideHeight> {
- static inline void Run(const uint8* input_data, int start_x, int start_y,
- int input_depth, int input_width, int input_height,
- int input_row_size, int32 input_offset,
- const uint8* filter_data, int32 filter_offset,
- const int32* bias_data, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_data,
- int output_depth, int output_width,
- uint8* shuffle_workspace) {
- int out_x = start_x;
-
- // 1x4 at a time.
- for (; out_x <= output_width - 4; out_x += 4) {
- const int32* bias_ptr = bias_data;
- const uint8* filter_ptr = filter_data;
-
- const uint8* input_ptr = input_data;
- uint8* output_ptr = output_data;
-
- for (int depth = 0; depth <= output_depth - 8; depth += 8) {
- ConvKernel3x3FilterDepth8<1, 4, kFixedStrideWidth, kFixedStrideHeight>::
- Run(input_ptr, input_depth, input_offset, input_row_size,
- filter_ptr, filter_offset, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr, output_depth, output_width);
-
- input_ptr += 8;
- output_ptr += 8;
- filter_ptr += 8;
- bias_ptr += 8;
- }
-
- input_data += 4 * kFixedStrideWidth * input_depth;
- output_data += 4 * output_depth;
- }
-
- // 1x1 at a time.
- for (; out_x < output_width; out_x++) {
- const int32* bias_ptr = bias_data;
- const uint8* filter_ptr = filter_data;
-
- const uint8* input_ptr = input_data;
- uint8* output_ptr = output_data;
-
- for (int depth = 0; depth <= output_depth - 8; depth += 8) {
- ConvKernel3x3FilterDepth8<1, 1, kFixedStrideWidth, kFixedStrideHeight>::
- Run(input_ptr, input_depth, input_offset, input_row_size,
- filter_ptr, filter_offset, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr, output_depth, output_width);
-
- input_ptr += 8;
- output_ptr += 8;
- filter_ptr += 8;
- bias_ptr += 8;
- }
+// Returns the input extent needed to produce |output| elements at the given
+inline int32 get_shuffle_input_size(int32 stride, int32 output) {
+ return stride * (output - 1) + 3;
+}
- input_data += kFixedStrideWidth * input_depth;
- output_data += output_depth;
- }
+// Indicates the input and output dimensions used when shuffling input
+// activations.
+struct ShuffleParams {
+ int32 output_width;
+ int32 output_height;
+ int32 input_width;
+ int32 input_height;
+
+ ShuffleParams() = default;
+ ShuffleParams(int32 output_width, int32 output_height, int32 stride_width,
+ int32 stride_height)
+ : output_width(output_width)
+ , output_height(output_height)
+ , input_width(get_shuffle_input_size(stride_width, output_width))
+ , input_height(get_shuffle_input_size(stride_height, output_height)) {
}
};
-template <int kFixedStrideWidth, int kFixedStrideHeight>
-struct ConvRow3x3FilterDepth8<2, kFixedStrideWidth, kFixedStrideHeight> {
- static inline void Run(const uint8* input_data, int start_x, int start_y,
- int input_depth, int input_width, int input_height,
- int input_row_size, int32 input_offset,
- const uint8* filter_data, int32 filter_offset,
- const int32* bias_data, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_data,
- int output_depth, int output_width,
- uint8* shuffle_workspace) {
- int out_x = start_x;
-
- // 2x4 at a time.
- for (; out_x <= output_width - 4; out_x += 4) {
- const int32* bias_ptr = bias_data;
- const uint8* filter_ptr = filter_data;
-
- const uint8* input_ptr = input_data;
- uint8* output_ptr = output_data;
-
- for (int depth = 0; depth <= output_depth - 8; depth += 8) {
- ConvKernel3x3FilterDepth8<2, 4, kFixedStrideWidth, kFixedStrideHeight>::
- Run(input_ptr, input_depth, input_offset, input_row_size,
- filter_ptr, filter_offset, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr, output_depth, output_width);
-
- input_ptr += 8;
- output_ptr += 8;
- filter_ptr += 8;
- bias_ptr += 8;
- }
-
- input_data += 4 * kFixedStrideWidth * input_depth;
- output_data += 4 * output_depth;
- }
-
- // 2x2 at a time.
- for (; out_x <= output_width - 2; out_x += 2) {
- const int32* bias_ptr = bias_data;
- const uint8* filter_ptr = filter_data;
-
- const uint8* input_ptr = input_data;
- uint8* output_ptr = output_data;
-
- for (int depth = 0; depth <= output_depth - 8; depth += 8) {
- ConvKernel3x3FilterDepth8<2, 2, kFixedStrideWidth, kFixedStrideHeight>::
- Run(input_ptr, input_depth, input_offset, input_row_size,
- filter_ptr, filter_offset, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr, output_depth, output_width);
-
- input_ptr += 8;
- output_ptr += 8;
- filter_ptr += 8;
- bias_ptr += 8;
- }
-
- input_data += 2 * kFixedStrideWidth * input_depth;
- output_data += 2 * output_depth;
- }
-
- // 2x1 at a time.
- for (; out_x < output_width; out_x++) {
- const int32* bias_ptr = bias_data;
- const uint8* filter_ptr = filter_data;
-
- const uint8* input_ptr = input_data;
- uint8* output_ptr = output_data;
-
- for (int depth = 0; depth <= output_depth - 8; depth += 8) {
- ConvKernel3x3FilterDepth8<2, 1, kFixedStrideWidth, kFixedStrideHeight>::
- Run(input_ptr, input_depth, input_offset, input_row_size,
- filter_ptr, filter_offset, bias_ptr, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr, output_depth, output_width);
-
- input_ptr += 8;
- output_ptr += 8;
- filter_ptr += 8;
- bias_ptr += 8;
- }
-
- input_data += kFixedStrideWidth * input_depth;
- output_data += output_depth;
+template <int32 kStrideWidth, int32 kStrideHeight>
+struct DepthwiseConvThroughDepth {
+  // Runs the DepthwiseConvWindow kernels through the depth dimension from
+  // |start_depth| to |end_depth|. Deliberately not inlined to keep the
+  // binary size small. Read-only parameters are passed via a
+  // DepthwiseConvParams struct to minimize call overhead.
+ static __attribute__((noinline)) void Run(const uint8* input_ptr,
+ const uint8* filter_ptr, const int32* bias_ptr, uint8* output_ptr,
+ int64_t start_depth, int64_t end_depth, int64_t input_depth,
+ int64_t input_row_size, int32 output_window_height,
+ int32 output_window_width, const DepthwiseConvParams& params) {
+ for (; start_depth <= end_depth - 8; start_depth += 8) {
+ DepthwiseConvWindow<8, kStrideWidth, kStrideHeight>::Run(
+ input_ptr, filter_ptr, bias_ptr, output_ptr, input_depth,
+          input_row_size, output_window_height, output_window_width, &params);
+ input_ptr += 8;
+ output_ptr += 8;
+ filter_ptr += 8;
+ bias_ptr += 8;
}
}
};
-template <>
-struct ConvRow3x3FilterDepth8<4, 1, 1> {
- static inline void Run(const uint8* input_data, int start_x, int start_y,
- int input_depth, int input_width, int input_height,
- int input_row_size, int32 input_offset,
- const uint8* filter_data, int32 filter_offset,
- const int32* bias_data, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_data,
- int output_depth, int output_width,
- uint8* shuffle_workspace) {
- int out_x = start_x;
-
- // 4x4 at a time.
- for (; out_x <= output_width - 4; out_x += 4) {
- const int32* bias_ptr = bias_data;
- const uint8* filter_ptr = filter_data;
-
- const uint8* input_ptr = input_data;
- uint8* output_ptr = output_data;
-
- for (int depth = 0; depth <= output_depth - 8; depth += 8) {
- ConvKernel3x3FilterDepth8<4, 4, 1, 1>::Run(
- input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
- filter_offset, bias_ptr, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max,
- output_ptr, output_depth, output_width);
-
- input_ptr += 8;
- output_ptr += 8;
- filter_ptr += 8;
- bias_ptr += 8;
- }
-
- input_data += 4 * input_depth;
- output_data += 4 * output_depth;
- }
-
- // Handle the rest of the right side.
- // 4x2 at a time.
- for (; out_x <= output_width - 2; out_x += 2) {
- const int32* bias_ptr = bias_data;
- const uint8* filter_ptr = filter_data;
-
- const uint8* input_ptr = input_data;
- uint8* output_ptr = output_data;
-
- for (int depth = 0; depth <= output_depth - 8; depth += 8) {
- ConvKernel3x3FilterDepth8<4, 2, 1, 1>::Run(
- input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
- filter_offset, bias_ptr, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max,
- output_ptr, output_depth, output_width);
-
- input_ptr += 8;
- output_ptr += 8;
- filter_ptr += 8;
- bias_ptr += 8;
- }
-
- input_data += 2 * input_depth;
- output_data += 2 * output_depth;
- }
-
- // 4x1 at a time.
- for (; out_x < output_width; out_x++) {
- const int32* bias_ptr = bias_data;
- const uint8* filter_ptr = filter_data;
-
- const uint8* input_ptr = input_data;
- uint8* output_ptr = output_data;
-
- for (int depth = 0; depth <= output_depth - 8; depth += 8) {
- ConvKernel3x3FilterDepth8<4, 1, 1, 1>::Run(
- input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
- filter_offset, bias_ptr, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max,
- output_ptr, output_depth, output_width);
-
- input_ptr += 8;
- output_ptr += 8;
- filter_ptr += 8;
- bias_ptr += 8;
- }
-
- input_data += input_depth;
- output_data += output_depth;
- }
- }
-};
+template <int32 kStrideWidth, int32 kStrideHeight>
+struct DepthwiseConvMultiRow {
+ using ConvKernel = DepthwiseConvThroughDepth<kStrideWidth, kStrideHeight>;
-template <>
-struct ConvRow3x3FilterDepth8<4, 2, 2> {
- // The buffer size of the shuffled input.
- static inline constexpr int ShuffleWorkspaceSize() { return 64 * 9 * 9; }
-
- static inline void Run(const uint8* input_data, int start_x, int start_y,
- int input_depth, int input_width, int input_height,
- int input_row_size, int32 input_offset,
- const uint8* filter_data, int32 filter_offset,
- const int32* bias_data, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_data,
- int output_depth, int output_width,
+ static inline void Run(const uint8* input_data, int32 start_x, int32 start_y,
+ const uint8* filter_data, const int32* bias_data,
+ uint8* output_data, const DepthwiseConvParams& params,
+ const ShuffleParams& shuffle_params,
uint8* shuffle_workspace) {
- // Branch and cache misses increase substantially with stride 2 kernels.
- // Adding prefetching reduces latency by as much as 2x.
- const int i0 = 0;
- const int i1 = input_depth;
- const int i2 = 2 * input_depth;
- const int i3 = 3 * input_depth;
- const int i4 = 4 * input_depth;
- const int i5 = 5 * input_depth;
- const int i6 = 6 * input_depth;
- const int i7 = 7 * input_depth;
- const int i8 = 8 * input_depth;
-
-#define DEPTHWISECONV_PRELOAD_ROW(input_ptr, i) \
- preload_l1_keep(input_ptr + i * input_row_size + i0); \
- preload_l1_keep(input_ptr + i * input_row_size + i1); \
- preload_l1_keep(input_ptr + i * input_row_size + i2); \
- preload_l1_keep(input_ptr + i * input_row_size + i3); \
- preload_l1_keep(input_ptr + i * input_row_size + i4); \
- preload_l1_keep(input_ptr + i * input_row_size + i5); \
- preload_l1_keep(input_ptr + i * input_row_size + i6); \
- preload_l1_keep(input_ptr + i * input_row_size + i7); \
- preload_l1_keep(input_ptr + i * input_row_size + i8);
-
- int out_x = start_x;
- // 4x4 at a time.
- for (; out_x <= output_width - 4; out_x += 4) {
- const int32* bias_ptr = bias_data;
- const uint8* filter_ptr = filter_data;
-
- const uint8* input_ptr = input_data;
- uint8* output_ptr = output_data;
-
- int depth = 0;
- for (; depth <= output_depth - 64; depth += 64) {
- // Preload 9x9 input.
- DEPTHWISECONV_PRELOAD_ROW(input_ptr, 0);
- DEPTHWISECONV_PRELOAD_ROW(input_ptr, 1);
- DEPTHWISECONV_PRELOAD_ROW(input_ptr, 2);
- DEPTHWISECONV_PRELOAD_ROW(input_ptr, 3);
- DEPTHWISECONV_PRELOAD_ROW(input_ptr, 4);
- DEPTHWISECONV_PRELOAD_ROW(input_ptr, 5);
- DEPTHWISECONV_PRELOAD_ROW(input_ptr, 6);
- DEPTHWISECONV_PRELOAD_ROW(input_ptr, 7);
- DEPTHWISECONV_PRELOAD_ROW(input_ptr, 8);
-
- // For a large input window (64x9x9) that is small enough to fit in L1
- // cache, copy the input into a separate buffer and run the kernel on
- // this new buffer. This reduces the likelihood of cache misses when
- // the kernel is loading input data. If this size is ever changed,
- // update the ShuffleWorkspaceSize() function to return the new size.
- ShuffleInput(input_ptr, input_depth, input_width, input_height, 64, 9,
- 9, shuffle_workspace);
- const uint8* shuffled_ptr = &shuffle_workspace[0];
-
- for (int micro_depth = 0; micro_depth <= 64 - 8; micro_depth += 8) {
- ConvKernel3x3FilterDepth8<4, 4, 2, 2>::Run(
- shuffled_ptr, 64, input_offset, 64 * 9, filter_ptr, filter_offset,
- bias_ptr, output_offset, output_multiplier, output_shift,
- output_activation_min, output_activation_max, output_ptr,
- output_depth, output_width);
-
- shuffled_ptr += 8;
- output_ptr += 8;
- filter_ptr += 8;
- bias_ptr += 8;
+ TFLITE_DCHECK(shuffle_params.input_height ==
+ get_shuffle_input_size(kStrideHeight, shuffle_params.output_height));
+ TFLITE_DCHECK(shuffle_params.input_width ==
+ get_shuffle_input_size(kStrideWidth, shuffle_params.output_width));
+ TFLITE_DCHECK(64 * shuffle_params.input_width * shuffle_params.input_height
+ <= DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE);
+
+ int32 out_x = start_x;
+
+ // Run shuffling on inputs with sufficiently large depth and width. When
+ // these parameters are large enough, more time is taken to load inputs
+ // from memory. At this point, it becomes useful to prefetch and
+ // preshuffle the input data to maximize locality.
+ if (params.output_depth > 64 ||
+ (params.output_depth <= 64 && params.input_width > 150)) {
+ for (; out_x <= (params.output_width - shuffle_params.output_width);
+ out_x += shuffle_params.output_width) {
+ const uint8* input_ptr = input_data;
+ const int32* bias_ptr = bias_data;
+ const uint8* filter_ptr = filter_data;
+ uint8* output_ptr = output_data;
+ int64_t depth = 0;
+ const int64_t shuffle_row_size = 64 * shuffle_params.input_width;
+
+ for (; depth <= params.output_depth - 64; depth += 64) {
+ // Preload.
+ const uint8* h_ptr = input_ptr;
+ for (int32 i = 0; i < shuffle_params.input_height; i++) {
+ const uint8* ptr = h_ptr;
+ for (int32 j = 0; j < shuffle_params.input_width; j++) {
+ asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
+ ptr += params.input_depth;
+ }
+ h_ptr += params.input_row_size;
+ }
+
+ // For a large enough input, shuffle into buckets.
+ ShuffleInput(input_ptr, params.input_depth, params.input_width,
+ params.input_height, 64, shuffle_params.input_width,
+ shuffle_params.input_height, shuffle_workspace);
+ ConvKernel::Run(shuffle_workspace, filter_ptr, bias_ptr, output_ptr,
+ 0, 64, 64, shuffle_row_size,
+ shuffle_params.output_height,
+ shuffle_params.output_width, params);
+ input_ptr += 64;
+ output_ptr += 64;
+ filter_ptr += 64;
+ bias_ptr += 64;
}
- input_ptr += 64;
- }
- // Preload 9x9 input one more time for the rest of the depth.
- DEPTHWISECONV_PRELOAD_ROW(input_ptr, 0);
- DEPTHWISECONV_PRELOAD_ROW(input_ptr, 1);
- DEPTHWISECONV_PRELOAD_ROW(input_ptr, 2);
- DEPTHWISECONV_PRELOAD_ROW(input_ptr, 3);
- DEPTHWISECONV_PRELOAD_ROW(input_ptr, 4);
- DEPTHWISECONV_PRELOAD_ROW(input_ptr, 5);
- DEPTHWISECONV_PRELOAD_ROW(input_ptr, 6);
- DEPTHWISECONV_PRELOAD_ROW(input_ptr, 7);
- DEPTHWISECONV_PRELOAD_ROW(input_ptr, 8);
-
- for (; depth <= output_depth - 8; depth += 8) {
- ConvKernel3x3FilterDepth8<4, 4, 2, 2>::Run(
- input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
- filter_offset, bias_ptr, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max,
- output_ptr, output_depth, output_width);
-
- input_ptr += 8;
- output_ptr += 8;
- filter_ptr += 8;
- bias_ptr += 8;
- }
-
- input_data += 4 * 2 * input_depth;
- output_data += 4 * output_depth;
- }
-
-#undef DEPTHWISECONV_PRELOAD_ROW
-
- // Handle the rest of the right side.
- // 4x2 at a time.
- for (; out_x <= output_width - 2; out_x += 2) {
- const int32* bias_ptr = bias_data;
- const uint8* filter_ptr = filter_data;
-
- const uint8* input_ptr = input_data;
- uint8* output_ptr = output_data;
-
- for (int depth = 0; depth <= output_depth - 8; depth += 8) {
- ConvKernel3x3FilterDepth8<4, 2, 2, 2>::Run(
- input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
- filter_offset, bias_ptr, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max,
- output_ptr, output_depth, output_width);
-
- input_ptr += 8;
- output_ptr += 8;
- filter_ptr += 8;
- bias_ptr += 8;
- }
+ // Preload.
+ const uint8* h_ptr = input_ptr;
+ for (int32 i = 0; i < shuffle_params.input_height; i++) {
+ const uint8* ptr = h_ptr;
+ for (int32 j = 0; j < shuffle_params.input_width; j++) {
+ asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
+ ptr += params.input_depth;
+ }
+ h_ptr += params.input_row_size;
+ }
- input_data += 2 * 2 * input_depth;
- output_data += 2 * output_depth;
- }
+ // Handle leftover depth.
+ ConvKernel::Run(input_ptr, filter_ptr, bias_ptr, output_ptr,
+ depth, params.output_depth, params.input_depth,
+ params.input_row_size, shuffle_params.output_height,
+ shuffle_params.output_width, params);
- // 4x1 at a time.
- for (; out_x < output_width; out_x++) {
- const int32* bias_ptr = bias_data;
- const uint8* filter_ptr = filter_data;
-
- const uint8* input_ptr = input_data;
- uint8* output_ptr = output_data;
-
- for (int depth = 0; depth <= output_depth - 8; depth += 8) {
- ConvKernel3x3FilterDepth8<4, 1, 2, 2>::Run(
- input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
- filter_offset, bias_ptr, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max,
- output_ptr, output_depth, output_width);
-
- input_ptr += 8;
- output_ptr += 8;
- filter_ptr += 8;
- bias_ptr += 8;
+ input_data +=
+ shuffle_params.output_width * kStrideWidth * params.input_depth;
+ output_data += shuffle_params.output_width * params.output_depth;
}
-
- input_data += 2 * input_depth;
- output_data += output_depth;
}
- }
-};
-
-template <>
-struct ConvRow3x3FilterDepth8<8, 2, 2> {
- static inline void Run(const uint8* input_data, int start_x, int start_y,
- int input_depth, int input_width, int input_height,
- int input_row_size, int32 input_offset,
- const uint8* filter_data, int32 filter_offset,
- const int32* bias_data, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_data,
- int output_depth, int output_width,
- uint8* shuffle_workspace) {
- // Reuse 4 row kernels twice.
- ConvRow3x3FilterDepth8<4, 2, 2>::Run(
- input_data, start_x, start_y, input_depth, input_width, input_height,
- input_row_size, input_offset, filter_data, filter_offset, bias_data,
- output_offset, output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_data, output_depth, output_width,
- shuffle_workspace);
-
- ConvRow3x3FilterDepth8<4, 2, 2>::Run(
- input_data + 2 * 4 * input_row_size, start_x, start_y + 4, input_depth,
- input_width, input_height, input_row_size, input_offset, filter_data,
- filter_offset, bias_data, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max,
- output_data + 4 * output_depth * output_width, output_depth,
- output_width, shuffle_workspace);
- }
-};
-
-template <>
-struct ConvRow3x3FilterDepth8<8, 1, 1> {
- // The buffer size of the shuffled input.
- static inline constexpr int ShuffleWorkspaceSize() { return 64 * 10 * 10; }
-
- static inline void Run(const uint8* input_data, int start_x, int start_y,
- int input_depth, int input_width, int input_height,
- int input_row_size, int32 input_offset,
- const uint8* filter_data, int32 filter_offset,
- const int32* bias_data, int32 output_offset,
- int32 output_multiplier, int output_shift,
- int32 output_activation_min,
- int32 output_activation_max, uint8* output_data,
- int output_depth, int output_width,
- uint8* shuffle_workspace) {
- int out_x = start_x;
- // 8x8 at a time.
- for (; out_x <= output_width - 8; out_x += 8) {
- const int32* bias_ptr = bias_data;
- const uint8* filter_ptr = filter_data;
-
- const uint8* input_ptr = input_data;
- uint8* output_ptr = output_data;
-
- int depth = 0;
- for (; depth <= output_depth - 64; depth += 64) {
- // For a large input window (64x10x10) that is small enough to fit in L1
- // cache, copy the input into a separate buffer and run the kernel on
- // this new buffer. This reduces the likelihood of cache misses when
- // the kernel is loading input data. If the size of the input window
- // changes, update the function ShuffleWorkspaceSize() with the new
- // size.
- ShuffleInput(input_ptr, input_depth, input_width, input_height, 64, 10,
- 10, shuffle_workspace);
- const uint8* shuffled_ptr = shuffle_workspace;
-
- for (int micro_depth = 0; micro_depth <= 64 - 8; micro_depth += 8) {
- ConvKernel3x3FilterDepth8<8, 8, 1, 1>::Run(
- shuffled_ptr, 64, input_offset, 64 * 10, filter_ptr,
- filter_offset, bias_ptr, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max,
- output_ptr, output_depth, output_width);
-
- shuffled_ptr += 8;
- output_ptr += 8;
- filter_ptr += 8;
- bias_ptr += 8;
- }
- input_ptr += 64;
- }
-
- for (; depth <= output_depth - 8; depth += 8) {
- ConvKernel3x3FilterDepth8<8, 8, 1, 1>::Run(
- input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
- filter_offset, bias_ptr, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max,
- output_ptr, output_depth, output_width);
-
- input_ptr += 8;
- output_ptr += 8;
- filter_ptr += 8;
- bias_ptr += 8;
- }
- input_data += 8 * input_depth;
- output_data += 8 * output_depth;
+ const int32 output_leftover_width = params.output_width - out_x;
+ if (output_leftover_width > 0) {
+ ConvKernel::Run(input_data, filter_data, bias_data, output_data, 0,
+ params.output_depth, params.input_depth,
+ params.input_row_size, shuffle_params.output_height,
+ output_leftover_width, params);
}
-
- // Handle the rest of the right side by re-using 4 row kernels twice.
- ConvRow3x3FilterDepth8<4, 1, 1>::Run(
- input_data, out_x, start_y, input_depth, input_width, input_height,
- input_row_size, input_offset, filter_data, filter_offset, bias_data,
- output_offset, output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_data, output_depth, output_width,
- shuffle_workspace);
-
- ConvRow3x3FilterDepth8<4, 1, 1>::Run(
- input_data + 4 * input_row_size, out_x, start_y + 4, input_depth,
- input_width, input_height, input_row_size, input_offset, filter_data,
- filter_offset, bias_data, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max,
- output_data + 4 * output_depth * output_width, output_depth,
- output_width, shuffle_workspace);
}
};
inline bool Fast3x3FilterKernelSupported(
- const Dims<4>& input_dims, const Dims<4>& filter_dims, int stride_width,
- int stride_height, int pad_width, int pad_height, int depth_multiplier,
- const Dims<4>& output_dims, int output_shift) {
- const int input_height = ArraySize(input_dims, 2);
- const int input_width = ArraySize(input_dims, 1);
- const int input_depth = ArraySize(input_dims, 0);
- const int filter_height = ArraySize(filter_dims, 2);
- const int filter_width = ArraySize(filter_dims, 1);
- const int output_height = ArraySize(output_dims, 2);
- const int output_width = ArraySize(output_dims, 1);
+ const Dims<4>& input_dims, const Dims<4>& filter_dims, int32 stride_width,
+ int32 stride_height, int32 pad_width, int32 pad_height,
+ int32 depth_multiplier, const Dims<4>& output_dims, int32 output_shift) {
+ const int32 input_height = ArraySize(input_dims, 2);
+ const int32 input_width = ArraySize(input_dims, 1);
+ const int32 input_depth = ArraySize(input_dims, 0);
+ const int32 filter_height = ArraySize(filter_dims, 2);
+ const int32 filter_width = ArraySize(filter_dims, 1);
+ const int32 output_height = ArraySize(output_dims, 2);
+ const int32 output_width = ArraySize(output_dims, 1);
bool supported =
filter_width == 3 && filter_height == 3 && depth_multiplier == 1 &&
// Handle case where padding is zero but padding type is not kValid.
// This would require special boundary case handling that is not supported.
- const int out_x = output_width - 1;
- const int out_y = output_height - 1;
+ const int32 out_x = output_width - 1;
+ const int32 out_y = output_height - 1;
- const int in_x_origin = (out_x * stride_width) - pad_width;
- const int in_y_origin = (out_y * stride_height) - pad_height;
+ const int32 in_x_origin = (out_x * stride_width) - pad_width;
+ const int32 in_y_origin = (out_y * stride_height) - pad_height;
- const int in_x_end = in_x_origin + filter_width;
- const int in_y_end = in_y_origin + filter_height;
+ const int32 in_x_end = in_x_origin + filter_width;
+ const int32 in_y_end = in_y_origin + filter_height;
// Supported only if filter on the right and bottom boundary lies completely
// within the input.
inline void DepthwiseConv3x3Filter(
const uint8* input_data, const Dims<4>& input_dims, int32 input_offset,
const uint8* filter_data, const Dims<4>& filter_dims, int32 filter_offset,
- const int32* bias_data, const Dims<4>& bias_dims, int stride_width,
- int stride_height, int pad_width, int pad_height, int depth_multiplier,
- int32 output_offset, int32 output_multiplier, int output_shift,
- int32 output_activation_min, int32 output_activation_max,
- uint8* output_data, const Dims<4>& output_dims) {
- const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
- const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
- const int input_height = ArraySize(input_dims, 2);
- const int input_width = ArraySize(input_dims, 1);
- const int input_depth = ArraySize(input_dims, 0);
- const int filter_height = ArraySize(filter_dims, 2);
- const int filter_width = ArraySize(filter_dims, 1);
- const int output_height = ArraySize(output_dims, 2);
- const int output_width = ArraySize(output_dims, 1);
-
- // Algorithm assumes below constraints. It is optimized for depth multiplier
- // of 1, 3x3 filter, no padding and strides 1 and 2.
- TFLITE_DCHECK(output_depth == input_depth * depth_multiplier);
+ const int32* bias_data, const Dims<4>& bias_dims, int32 stride_width,
+ int32 stride_height, int32 pad_width, int32 pad_height,
+ int32 depth_multiplier, int32 output_offset, int32 output_multiplier,
+ int32 output_shift, int32 output_activation_min,
+ int32 output_activation_max, uint8* output_data,
+ const Dims<4>& output_dims) {
+ DepthwiseConvParams params;
+ params.input_depth = ArraySize(input_dims, 0);
+ params.input_width = ArraySize(input_dims, 1);
+ params.input_height = ArraySize(input_dims, 2);
+ params.input_row_size = params.input_depth * params.input_width;
+ params.input_offset = input_offset;
+ params.output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
+ params.output_width = ArraySize(output_dims, 1);
+ params.output_height = ArraySize(output_dims, 2);
+ params.output_row_size = params.output_depth * params.output_width;
+ params.output_offset = output_offset;
+ params.filter_offset = filter_offset;
+ params.output_multiplier = output_multiplier;
+ params.output_shift = output_shift;
+ params.output_activation_min = output_activation_min;
+ params.output_activation_max = output_activation_max;
+
+ const int32 filter_height = ArraySize(filter_dims, 2);
+ const int32 filter_width = ArraySize(filter_dims, 1);
+
+ // Algorithm assumes below constraints. It is optimized for depth
+ // multiplier of 1, 3x3 filter, no padding and strides 1 and 2.
+ TFLITE_DCHECK(params.output_depth == params.input_depth * depth_multiplier);
TFLITE_DCHECK(depth_multiplier == 1);
TFLITE_DCHECK(filter_height == 3);
TFLITE_DCHECK(filter_width == 3);
- TFLITE_DCHECK(pad_height == 0);
- TFLITE_DCHECK(pad_width == 0);
TFLITE_DCHECK(stride_height == 1 || stride_height == 2);
TFLITE_DCHECK(stride_width == 1 || stride_width == 2);
TFLITE_DCHECK(stride_width == stride_height);
+ TFLITE_DCHECK(pad_height == 0);
+ TFLITE_DCHECK(pad_width == 0);
- const int input_row_size = input_depth * (input_width + 2 * pad_width);
- const int output_row_size = output_depth * output_width;
- const int input_batch_size = input_row_size * (input_height + 2 * pad_height);
- const int output_batch_size = output_depth * output_width * output_height;
-
- using conv_row_func_t = decltype(&ConvRow3x3FilterDepth8<1, 1, 1>::Run);
- conv_row_func_t conv_1_output_row = ConvRow3x3FilterDepth8<1, 1, 1>::Run;
- conv_row_func_t conv_2_output_rows = ConvRow3x3FilterDepth8<2, 1, 1>::Run;
- conv_row_func_t conv_4_output_rows = ConvRow3x3FilterDepth8<4, 1, 1>::Run;
- conv_row_func_t conv_8_output_rows = ConvRow3x3FilterDepth8<8, 1, 1>::Run;
+ const int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+ const int64_t input_batch_size = params.input_row_size * params.input_height;
+ const int64_t output_batch_size =
+ params.output_row_size * params.output_height;
+
+ ShuffleParams one_row_shuffle_params, two_row_shuffle_params,
+ four_row_shuffle_params, eight_row_shuffle_params;
+ if (stride_width == 1) {
+ one_row_shuffle_params = ShuffleParams(30, 1, 1, 1);
+ two_row_shuffle_params = ShuffleParams(22, 2, 1, 1);
+ four_row_shuffle_params = ShuffleParams(14, 4, 1, 1);
+ eight_row_shuffle_params = ShuffleParams(8, 8, 1, 1);
+ } else {
+ one_row_shuffle_params = ShuffleParams(14, 1, 2, 2);
+ two_row_shuffle_params = ShuffleParams(8, 2, 2, 2);
+ four_row_shuffle_params = ShuffleParams(4, 4, 2, 2);
+ eight_row_shuffle_params = ShuffleParams(2, 8, 2, 2);
+ }
+ using conv_multirow_func_t = decltype(&DepthwiseConvMultiRow<1, 1>::Run);
+ conv_multirow_func_t conv_multirow_func = DepthwiseConvMultiRow<1, 1>::Run;
if (stride_width == 2) {
- conv_1_output_row = ConvRow3x3FilterDepth8<1, 2, 2>::Run;
- conv_2_output_rows = ConvRow3x3FilterDepth8<2, 2, 2>::Run;
- conv_4_output_rows = ConvRow3x3FilterDepth8<4, 2, 2>::Run;
- conv_8_output_rows = ConvRow3x3FilterDepth8<8, 2, 2>::Run;
+ conv_multirow_func = DepthwiseConvMultiRow<2, 2>::Run;
}
// Allocate maximum memory needed for shuffled input.
// TODO(mariewhite): The size of this workspace is small enough to be
// allocated on the stack. Eventually we will want to move it to the heap
- // and have it allocated outside of this function, like the im2col_array used
- // in gemmlowp.
-#define DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE 10 * 10 * 64
+ // and have it allocated outside of this function, like the im2col_array
+ // used in gemmlowp.
uint8 shuffle_workspace[DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE];
- // Make sure the kernels using this buffer will not run out of bounds.
- static_assert(ConvRow3x3FilterDepth8<8, 1, 1>::ShuffleWorkspaceSize() <=
- DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE,
- "Shuffle workspace size is too small.");
- static_assert(ConvRow3x3FilterDepth8<4, 2, 2>::ShuffleWorkspaceSize() <=
- DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE,
- "Shuffle workspace size is too small.");
-
-#undef DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE
-
- for (int b = 0; b < batches; ++b) {
+ for (int32 b = 0; b < batches; ++b) {
const uint8* input_ptr = input_data + b * input_batch_size;
uint8* output_ptr = output_data + b * output_batch_size;
- int out_y = 0;
+ int32 out_y = 0;
+
+ // Shuffling shapes that maximize width over the shuffle workspace size
+ // perform better since the inputs are closer together, minimizing
+ // shuffling time.
+ //
+ // If the input shape has width large enough for the 2 row kernels,
+ // we prefer to use this. The innermost loop of the kernels handle
+ // 2 height x 2 width so this is the fastest path.
+ //
+ // If the input shape has smaller width but larger height, shuffling is
+    // still useful and can benefit from the 4 row and 8 row kernels.
// Handle 8 rows at a time.
- for (; out_y <= output_height - 8; out_y += 8) {
- conv_8_output_rows(input_ptr, 0, out_y, input_depth, input_width,
- input_height, input_row_size, input_offset,
- filter_data, filter_offset, bias_data, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr, output_depth,
- output_width, shuffle_workspace);
-
- input_ptr += 8 * stride_height * input_row_size;
- output_ptr += 8 * output_row_size;
+ if (params.input_width < four_row_shuffle_params.input_width) {
+ for (; out_y <= params.output_height - 8; out_y += 8) {
+ conv_multirow_func(input_ptr, 0, out_y, filter_data, bias_data,
+ output_ptr, params, eight_row_shuffle_params,
+ shuffle_workspace);
+ input_ptr += 8 * stride_height * params.input_row_size;
+ output_ptr += 8 * params.output_row_size;
+ }
}
// Handle 4 rows at a time.
- for (; out_y <= output_height - 4; out_y += 4) {
- conv_4_output_rows(input_ptr, 0, out_y, input_depth, input_width,
- input_height, input_row_size, input_offset,
- filter_data, filter_offset, bias_data, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr, output_depth,
- output_width, shuffle_workspace);
-
- input_ptr += 4 * stride_height * input_row_size;
- output_ptr += 4 * output_row_size;
+ if (params.input_width < two_row_shuffle_params.input_width) {
+ for (; out_y <= params.output_height - 4; out_y += 4) {
+ conv_multirow_func(input_ptr, 0, out_y, filter_data, bias_data,
+ output_ptr, params, four_row_shuffle_params,
+ shuffle_workspace);
+ input_ptr += 4 * stride_height * params.input_row_size;
+ output_ptr += 4 * params.output_row_size;
+ }
}
// Handle 2 rows at a time.
- for (; out_y <= output_height - 2; out_y += 2) {
- conv_2_output_rows(input_ptr, 0, out_y, input_depth, input_width,
- input_height, input_row_size, input_offset,
- filter_data, filter_offset, bias_data, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr, output_depth,
- output_width, shuffle_workspace);
-
- input_ptr += 2 * stride_height * input_row_size;
- output_ptr += 2 * output_row_size;
+ for (; out_y <= params.output_height - 2; out_y += 2) {
+ conv_multirow_func(input_ptr, 0, out_y, filter_data, bias_data,
+ output_ptr, params, two_row_shuffle_params,
+ shuffle_workspace);
+ input_ptr += 2 * stride_height * params.input_row_size;
+ output_ptr += 2 * params.output_row_size;
}
// Handle one row at a time.
- for (; out_y < output_height; out_y++) {
- conv_1_output_row(input_ptr, 0, out_y, input_depth, input_width,
- input_height, input_row_size, input_offset, filter_data,
- filter_offset, bias_data, output_offset,
- output_multiplier, output_shift, output_activation_min,
- output_activation_max, output_ptr, output_depth,
- output_width, shuffle_workspace);
-
- input_ptr += stride_height * input_row_size;
- output_ptr += output_row_size;
+ for (; out_y < params.output_height; out_y++) {
+ conv_multirow_func(input_ptr, 0, out_y, filter_data, bias_data,
+ output_ptr, params, one_row_shuffle_params,
+ shuffle_workspace);
+ input_ptr += stride_height * params.input_row_size;
+ output_ptr += params.output_row_size;
}
}
}
+// clang-format on
#endif // __aarch64__