DepthwiseConv Optimizations

author A. Unique TensorFlower <gardener@tensorflow.org>

Wed, 11 Apr 2018 20:29:12 +0000 (13:29 -0700)

committer TensorFlower Gardener <gardener@tensorflow.org>

Wed, 11 Apr 2018 20:31:29 +0000 (13:31 -0700)
author A. Unique TensorFlower <gardener@tensorflow.org>
Wed, 11 Apr 2018 20:29:12 +0000 (13:29 -0700)
committer TensorFlower Gardener <gardener@tensorflow.org>
Wed, 11 Apr 2018 20:31:29 +0000 (13:31 -0700)
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h

index 0f78e0f..dd6932f 100644 (file)
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
@@ -1696,15 +1696,15 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
  #ifdef __aarch64__
    // Call kernel optimized for depthwise convolutions using 3x3 filters if
    // parameters are supported.
-  if (Fast3by3FilterKernelSupported(input_dims, filter_dims, stride_width,
-                                    stride_height, pad_width, pad_height,
-                                    depth_multiplier, output_dims)) {
-    DepthwiseConv3by3FilterDepth16(
-        input_data, input_dims, input_offset, filter_data, filter_dims,
-        filter_offset, bias_data, bias_dims, stride_width, stride_height,
-        pad_width, pad_height, depth_multiplier, output_offset,
-        output_multiplier, output_shift, output_activation_min,
-        output_activation_max, output_data, output_dims);
+  if (Fast3x3FilterKernelSupported(input_dims, filter_dims, stride_width,
+                                   stride_height, pad_width, pad_height,
+                                   depth_multiplier, output_dims)) {
+    DepthwiseConv3x3Filter(input_data, input_dims, input_offset, filter_data,
+                           filter_dims, filter_offset, bias_data, bias_dims,
+                           stride_width, stride_height, pad_width, pad_height,
+                           depth_multiplier, output_offset, output_multiplier,
+                           output_shift, output_activation_min,
+                           output_activation_max, output_data, output_dims);
      return;
    }
  #endif
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h

index a349892..cdcb166 100644 (file)
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
  
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -40,412 +40,4380 @@ inline void preload_l1_keep(const uint8* ptr) {
  // NEON intrinsics vector data types.
  // See: https://bugs.llvm.org/show_bug.cgi?id=34945
  
-struct Int32x16 {
-  int32x4_t v0, v1, v2, v3;
+struct Int32x8 {
+  int32x4_t low, high;
  };
  
-struct Int16x16 {
-  int16x8_t low, high;
+struct Filter3x3x8 {
+  int16x8_t f0, f1, f2, f3, f4, f5, f6, f7, f8;
  };
  
-struct Int16x16x3 {
-  Int16x16 v0, v1, v2;
+// Loads 3x3 filter of depth 8 and adds filter offsets.
+inline Filter3x3x8 Load3x3Filter(const uint8* filter_ptr, int32 filter_offset,
+                                 int output_depth) {
+  Filter3x3x8 filter;
+
+  uint8x8_t temp_u8_0, temp_u8_1, temp_u8_2, temp_u8_3, temp_u8_4, temp_u8_5,
+      temp_u8_6, temp_u8_7, temp_u8_8;
+  int16x8_t filter_offset_vec = vdupq_n_s16(filter_offset);
+
+  temp_u8_0 = vld1_u8(filter_ptr + 0 * output_depth);
+  temp_u8_1 = vld1_u8(filter_ptr + 1 * output_depth);
+  temp_u8_2 = vld1_u8(filter_ptr + 2 * output_depth);
+  temp_u8_3 = vld1_u8(filter_ptr + 3 * output_depth);
+  temp_u8_4 = vld1_u8(filter_ptr + 4 * output_depth);
+  temp_u8_5 = vld1_u8(filter_ptr + 5 * output_depth);
+  temp_u8_6 = vld1_u8(filter_ptr + 6 * output_depth);
+  temp_u8_7 = vld1_u8(filter_ptr + 7 * output_depth);
+  temp_u8_8 = vld1_u8(filter_ptr + 8 * output_depth);
+
+  filter.f0 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_0));
+  filter.f1 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_1));
+  filter.f2 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_2));
+  filter.f3 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_3));
+  filter.f4 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_4));
+  filter.f5 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_5));
+  filter.f6 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_6));
+  filter.f7 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_7));
+  filter.f8 = vreinterpretq_s16_u16(vmovl_u8(temp_u8_8));
+
+  filter.f0 = vaddq_s16(filter.f0, filter_offset_vec);
+  filter.f1 = vaddq_s16(filter.f1, filter_offset_vec);
+  filter.f2 = vaddq_s16(filter.f2, filter_offset_vec);
+  filter.f3 = vaddq_s16(filter.f3, filter_offset_vec);
+  filter.f4 = vaddq_s16(filter.f4, filter_offset_vec);
+  filter.f5 = vaddq_s16(filter.f5, filter_offset_vec);
+  filter.f6 = vaddq_s16(filter.f6, filter_offset_vec);
+  filter.f7 = vaddq_s16(filter.f7, filter_offset_vec);
+  filter.f8 = vaddq_s16(filter.f8, filter_offset_vec);
+
+  return filter;
+}
+
+// Applies activation, offset and downquantize on a set of accumulator
+// registers that correspond to a 2x2 output of depth 8.
+// Stores results to output.
+inline void DownquantizeAndStore2x2Output(
+    Int32x8 acc_0, Int32x8 acc_1, Int32x8 acc_2, Int32x8 acc_3,
+    int32 output_offset, int32 output_multiplier, int output_shift,
+    int32 output_activation_min, int32 output_activation_max, uint8* output_ptr,
+    int output_depth, int output_width) {
+  using gemmlowp::RoundingDivideByPOT;
+  const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
+  const int32x4_t output_activation_min_vec =
+      vdupq_n_s32(output_activation_min);
+  const int32x4_t output_activation_max_vec =
+      vdupq_n_s32(output_activation_max);
+
+  // Fixed-point multiplication.
+  acc_0.low = vqrdmulhq_n_s32(acc_0.low, output_multiplier);
+  acc_0.high = vqrdmulhq_n_s32(acc_0.high, output_multiplier);
+  acc_1.low = vqrdmulhq_n_s32(acc_1.low, output_multiplier);
+  acc_1.high = vqrdmulhq_n_s32(acc_1.high, output_multiplier);
+  acc_2.low = vqrdmulhq_n_s32(acc_2.low, output_multiplier);
+  acc_2.high = vqrdmulhq_n_s32(acc_2.high, output_multiplier);
+  acc_3.low = vqrdmulhq_n_s32(acc_3.low, output_multiplier);
+  acc_3.high = vqrdmulhq_n_s32(acc_3.high, output_multiplier);
+
+  acc_0.low = RoundingDivideByPOT(acc_0.low, output_shift);
+  acc_0.high = RoundingDivideByPOT(acc_0.high, output_shift);
+  acc_1.low = RoundingDivideByPOT(acc_1.low, output_shift);
+  acc_1.high = RoundingDivideByPOT(acc_1.high, output_shift);
+  acc_2.low = RoundingDivideByPOT(acc_2.low, output_shift);
+  acc_2.high = RoundingDivideByPOT(acc_2.high, output_shift);
+  acc_3.low = RoundingDivideByPOT(acc_3.low, output_shift);
+  acc_3.high = RoundingDivideByPOT(acc_3.high, output_shift);
+
+  // Add the output offset.
+  acc_0.low = vaddq_s32(acc_0.low, output_offset_vec);
+  acc_0.high = vaddq_s32(acc_0.high, output_offset_vec);
+  acc_1.low = vaddq_s32(acc_1.low, output_offset_vec);
+  acc_1.high = vaddq_s32(acc_1.high, output_offset_vec);
+  acc_2.low = vaddq_s32(acc_2.low, output_offset_vec);
+  acc_2.high = vaddq_s32(acc_2.high, output_offset_vec);
+  acc_3.low = vaddq_s32(acc_3.low, output_offset_vec);
+  acc_3.high = vaddq_s32(acc_3.high, output_offset_vec);
+
+  // Apply the activation function.
+  acc_0.low = vmaxq_s32(acc_0.low, output_activation_min_vec);
+  acc_0.high = vmaxq_s32(acc_0.high, output_activation_min_vec);
+  acc_1.low = vmaxq_s32(acc_1.low, output_activation_min_vec);
+  acc_1.high = vmaxq_s32(acc_1.high, output_activation_min_vec);
+  acc_2.low = vmaxq_s32(acc_2.low, output_activation_min_vec);
+  acc_2.high = vmaxq_s32(acc_2.high, output_activation_min_vec);
+  acc_3.low = vmaxq_s32(acc_3.low, output_activation_min_vec);
+  acc_3.high = vmaxq_s32(acc_3.high, output_activation_min_vec);
+
+  acc_0.low = vminq_s32(acc_0.low, output_activation_max_vec);
+  acc_0.high = vminq_s32(acc_0.high, output_activation_max_vec);
+  acc_1.low = vminq_s32(acc_1.low, output_activation_max_vec);
+  acc_1.high = vminq_s32(acc_1.high, output_activation_max_vec);
+  acc_2.low = vminq_s32(acc_2.low, output_activation_max_vec);
+  acc_2.high = vminq_s32(acc_2.high, output_activation_max_vec);
+  acc_3.low = vminq_s32(acc_3.low, output_activation_max_vec);
+  acc_3.high = vminq_s32(acc_3.high, output_activation_max_vec);
+
+  // Saturating cast to uint8 and store to destination.
+  int16x4_t acc_0_low_s16 = vqmovn_s32(acc_0.low);
+  int16x4_t acc_0_high_s16 = vqmovn_s32(acc_0.high);
+  int16x4_t acc_1_low_s16 = vqmovn_s32(acc_1.low);
+  int16x4_t acc_1_high_s16 = vqmovn_s32(acc_1.high);
+  int16x4_t acc_2_low_s16 = vqmovn_s32(acc_2.low);
+  int16x4_t acc_2_high_s16 = vqmovn_s32(acc_2.high);
+  int16x4_t acc_3_low_s16 = vqmovn_s32(acc_3.low);
+  int16x4_t acc_3_high_s16 = vqmovn_s32(acc_3.high);
+
+  int16x8_t res_0_s16 = vcombine_s16(acc_0_low_s16, acc_0_high_s16);
+  int16x8_t res_1_s16 = vcombine_s16(acc_1_low_s16, acc_1_high_s16);
+  int16x8_t res_2_s16 = vcombine_s16(acc_2_low_s16, acc_2_high_s16);
+  int16x8_t res_3_s16 = vcombine_s16(acc_3_low_s16, acc_3_high_s16);
+
+  uint8x8_t res_0_u8 = vqmovun_s16(res_0_s16);
+  uint8x8_t res_1_u8 = vqmovun_s16(res_1_s16);
+  uint8x8_t res_2_u8 = vqmovun_s16(res_2_s16);
+  uint8x8_t res_3_u8 = vqmovun_s16(res_3_s16);
+
+  vst1_u8(output_ptr, res_0_u8);
+  vst1_u8(output_ptr + output_depth, res_1_u8);
+  vst1_u8(output_ptr + output_depth * output_width, res_2_u8);
+  vst1_u8(output_ptr + output_depth * output_width + output_depth, res_3_u8);
+}
+
+inline void DownquantizeAndStore(Int32x8 acc, int32 output_offset,
+                                 int32 output_multiplier, int output_shift,
+                                 int32 output_activation_min,
+                                 int32 output_activation_max,
+                                 uint8* output_ptr) {
+  using gemmlowp::RoundingDivideByPOT;
+  const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
+  const int32x4_t output_activation_min_vec =
+      vdupq_n_s32(output_activation_min);
+  const int32x4_t output_activation_max_vec =
+      vdupq_n_s32(output_activation_max);
+
+  acc.low = vqrdmulhq_n_s32(acc.low, output_multiplier);
+  acc.high = vqrdmulhq_n_s32(acc.high, output_multiplier);
+
+  acc.low = RoundingDivideByPOT(acc.low, output_shift);
+  acc.high = RoundingDivideByPOT(acc.high, output_shift);
+
+  acc.low = vaddq_s32(acc.low, output_offset_vec);
+  acc.high = vaddq_s32(acc.high, output_offset_vec);
+
+  acc.low = vmaxq_s32(acc.low, output_activation_min_vec);
+  acc.high = vmaxq_s32(acc.high, output_activation_min_vec);
+
+  acc.low = vminq_s32(acc.low, output_activation_max_vec);
+  acc.high = vminq_s32(acc.high, output_activation_max_vec);
+
+  int16x4_t acc_low_s16 = vqmovn_s32(acc.low);
+  int16x4_t acc_high_s16 = vqmovn_s32(acc.high);
+
+  int16x8_t res_s16 = vcombine_s16(acc_low_s16, acc_high_s16);
+  uint8x8_t res_u8 = vqmovun_s16(res_s16);
+  vst1_u8(output_ptr, res_u8);
+}
+
+inline void DownquantizeAndStore2Output(
+    Int32x8 acc_0, Int32x8 acc_1, int32 output_offset, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    uint8* output_ptr, int output_ptr_offset) {
+  {
+    using gemmlowp::RoundingDivideByPOT;
+    const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
+    const int32x4_t output_activation_min_vec =
+        vdupq_n_s32(output_activation_min);
+    const int32x4_t output_activation_max_vec =
+        vdupq_n_s32(output_activation_max);
+
+    // Fixed-point multiplication.
+    acc_0.low = vqrdmulhq_n_s32(acc_0.low, output_multiplier);
+    acc_0.high = vqrdmulhq_n_s32(acc_0.high, output_multiplier);
+    acc_1.low = vqrdmulhq_n_s32(acc_1.low, output_multiplier);
+    acc_1.high = vqrdmulhq_n_s32(acc_1.high, output_multiplier);
+
+    acc_0.low = RoundingDivideByPOT(acc_0.low, output_shift);
+    acc_0.high = RoundingDivideByPOT(acc_0.high, output_shift);
+    acc_1.low = RoundingDivideByPOT(acc_1.low, output_shift);
+    acc_1.high = RoundingDivideByPOT(acc_1.high, output_shift);
+
+    // Add the output offset.
+    acc_0.low = vaddq_s32(acc_0.low, output_offset_vec);
+    acc_0.high = vaddq_s32(acc_0.high, output_offset_vec);
+    acc_1.low = vaddq_s32(acc_1.low, output_offset_vec);
+    acc_1.high = vaddq_s32(acc_1.high, output_offset_vec);
+
+    // Apply the activation function.
+    acc_0.low = vmaxq_s32(acc_0.low, output_activation_min_vec);
+    acc_0.high = vmaxq_s32(acc_0.high, output_activation_min_vec);
+    acc_1.low = vmaxq_s32(acc_1.low, output_activation_min_vec);
+    acc_1.high = vmaxq_s32(acc_1.high, output_activation_min_vec);
+
+    acc_0.low = vminq_s32(acc_0.low, output_activation_max_vec);
+    acc_0.high = vminq_s32(acc_0.high, output_activation_max_vec);
+    acc_1.low = vminq_s32(acc_1.low, output_activation_max_vec);
+    acc_1.high = vminq_s32(acc_1.high, output_activation_max_vec);
+  }
+
+  // Saturating cast to uint8 and store to destination.
+  int16x8_t res_0_s16;
+  {
+    int16x4_t acc_0_low_s16 = vqmovn_s32(acc_0.low);
+    int16x4_t acc_0_high_s16 = vqmovn_s32(acc_0.high);
+    res_0_s16 = vcombine_s16(acc_0_low_s16, acc_0_high_s16);
+  }
+
+  int16x8_t res_1_s16;
+  {
+    int16x4_t acc_1_low_s16 = vqmovn_s32(acc_1.low);
+    int16x4_t acc_1_high_s16 = vqmovn_s32(acc_1.high);
+    res_1_s16 = vcombine_s16(acc_1_low_s16, acc_1_high_s16);
+  }
+
+  uint8x8_t res_0_u8 = vqmovun_s16(res_0_s16);
+  uint8x8_t res_1_u8 = vqmovun_s16(res_1_s16);
+  vst1_u8(output_ptr, res_0_u8);
+  vst1_u8(output_ptr + output_ptr_offset, res_1_u8);
+}
+
+// Performs multiply accumulate on 3 inputs of depth 8.
+inline Int32x8 MultiplyAccumulateRow(Int32x8 accum, int16x8_t f0, int16x8_t f1,
+                                     int16x8_t f2, int16x8_t i0, int16x8_t i1,
+                                     int16x8_t i2) {
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f0), vget_low_s16(i0));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f0), vget_high_s16(i0));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f1), vget_low_s16(i1));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f1), vget_high_s16(i1));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f2), vget_low_s16(i2));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f2), vget_high_s16(i2));
+  return accum;
+}
+
+// Performs multiply accumulate on 3 inputs of depth 8.
+inline Int32x8 MultiplyAccumulate3x3Filter(const Filter3x3x8& f, int16x8_t i0,
+                                           int16x8_t i1, int16x8_t i2,
+                                           int16x8_t i3, int16x8_t i4,
+                                           int16x8_t i5, int16x8_t i6,
+                                           int16x8_t i7, int16x8_t i8,
+                                           Int32x8 accum) {
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f0), vget_low_s16(i0));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f0), vget_high_s16(i0));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f1), vget_low_s16(i1));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f1), vget_high_s16(i1));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f2), vget_low_s16(i2));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f2), vget_high_s16(i2));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f3), vget_low_s16(i3));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f3), vget_high_s16(i3));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f4), vget_low_s16(i4));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f4), vget_high_s16(i4));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f5), vget_low_s16(i5));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f5), vget_high_s16(i5));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f6), vget_low_s16(i6));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f6), vget_high_s16(i6));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f7), vget_low_s16(i7));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f7), vget_high_s16(i7));
+  accum.low = vmlal_s16(accum.low, vget_low_s16(f.f8), vget_low_s16(i8));
+  accum.high = vmlal_s16(accum.high, vget_high_s16(f.f8), vget_high_s16(i8));
+  return accum;
+}
+
+inline void DotProductAndStore(const Filter3x3x8& filter, int16x8_t i0,
+                               int16x8_t i1, int16x8_t i2, int16x8_t i3,
+                               int16x8_t i4, int16x8_t i5, int16x8_t i6,
+                               int16x8_t i7, int16x8_t i8,
+                               const int32* bias_ptr, int32 output_offset,
+                               int32 output_multiplier, int output_shift,
+                               int32 output_activation_min,
+                               int32 output_activation_max, uint8* output_ptr) {
+  Int32x8 acc;
+  acc.low = vld1q_s32(bias_ptr);
+  acc.high = vld1q_s32(bias_ptr + 4);
+
+  acc = MultiplyAccumulate3x3Filter(filter, i0, i1, i2, i3, i4, i5, i6, i7, i8,
+                                    acc);
+
+  DownquantizeAndStore(acc, output_offset, output_multiplier, output_shift,
+                       output_activation_min, output_activation_max,
+                       output_ptr);
+}
+
+// Performs multiply-accumulate on a 3x4 input for 2 horizontal outputs.
+inline void DotProductAndStore2xStride1(
+    const Filter3x3x8& filter, int16x8_t i0, int16x8_t i1, int16x8_t i2,
+    int16x8_t i3, int16x8_t i4, int16x8_t i5, int16x8_t i6, int16x8_t i7,
+    int16x8_t i8, int16x8_t i9, int16x8_t i10, int16x8_t i11,
+    const int32* bias_ptr, int32 output_offset, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    uint8* output_ptr, int output_ptr_offset) {
+  Int32x8 acc_0, acc_1;
+  acc_0.low = vld1q_s32(bias_ptr);
+  acc_1.low = vld1q_s32(bias_ptr);
+  acc_0.high = vld1q_s32(bias_ptr + 4);
+  acc_1.high = vld1q_s32(bias_ptr + 4);
+
+  acc_0 = MultiplyAccumulate3x3Filter(filter, i0, i1, i2, i4, i5, i6, i8, i9,
+                                      i10, acc_0);
+  acc_1 = MultiplyAccumulate3x3Filter(filter, i1, i2, i3, i5, i6, i7, i9, i10,
+                                      i11, acc_1);
+  DownquantizeAndStore2Output(acc_0, acc_1, output_offset, output_multiplier,
+                              output_shift, output_activation_min,
+                              output_activation_max, output_ptr,
+                              output_ptr_offset);
+}
+
+// Performs multiply-accumulate on a 4x3 input for 2 vertical outputs.
+inline void DotProductAndStore2yStride1(
+    const Filter3x3x8& filter, int16x8_t i0, int16x8_t i1, int16x8_t i2,
+    int16x8_t i3, int16x8_t i4, int16x8_t i5, int16x8_t i6, int16x8_t i7,
+    int16x8_t i8, int16x8_t i9, int16x8_t i10, int16x8_t i11,
+    const int32* bias_ptr, int32 output_offset, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    uint8* output_ptr, int output_ptr_offset) {
+  Int32x8 acc_0, acc_1;
+  acc_0.low = vld1q_s32(bias_ptr);
+  acc_1.low = vld1q_s32(bias_ptr);
+  acc_0.high = vld1q_s32(bias_ptr + 4);
+  acc_1.high = vld1q_s32(bias_ptr + 4);
+
+  acc_0 = MultiplyAccumulate3x3Filter(filter, i0, i1, i2, i3, i4, i5, i6, i7,
+                                      i8, acc_0);
+  acc_1 = MultiplyAccumulate3x3Filter(filter, i3, i4, i5, i6, i7, i8, i9, i10,
+                                      i11, acc_1);
+  DownquantizeAndStore2Output(acc_0, acc_1, output_offset, output_multiplier,
+                              output_shift, output_activation_min,
+                              output_activation_max, output_ptr,
+                              output_ptr_offset);
+}
+
+// A kernel that is optimized on the number of output cells in the x and y
+// direction, and the stride. Assumes 3x3 filters of 16 depth.
+template <int kFixedOutputY, int kFixedOutputX, int kFixedStride = 1>
+struct ConvKernel3x3FilterDepth8 {};
+
+template <>
+struct ConvKernel3x3FilterDepth8<8, 8, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    const int output_row_size = output_depth * output_width;
+
+    // To process 8x8 outputs using a 3x3 filter, we require 10x10 inputs.
+    // Load inputs for the first 2 filters on the top left, then slide to
+    // the right, down, left, down, right, etc. in a snake-like path. This
+    // minimizes the total number of loads.
+    //
+    //        INPUT                          OUTPUT
+    //   |\----------------\               |\------------\
+    //   | \                \              | \            \
+    //   |  \----------------\             |  \------------\
+    //   |  | 0    ...     9 |             |  | 0  ...   7 |
+    //   |  | 10   ...    19 |     --->    |  | 8  ...  15 |
+    //   |  | 20   ...    29 |              \ | .. ...  .. |
+    //    \ | ..   ...    .. |               \| 56 ...  63 |
+    //     \| 90   ...   109 |                |------------|
+    //      |----------------|
+    //
+    // The first set of loads corresponds to:
+    //
+    //        INPUT                          OUTPUT
+    //   |\-----------------                |\-----------
+    //   | \                                | \
+    //   |  \-----------------              |  \----------
+    //   |  | 0  1   2  3 ...               |  | 0  1 ...
+    //   |  | 10 11 12 13 ...     --->      |  | ..   ...
+    //   |  | 20 21 22 23 ...                  | ..   ...
+    //   |  | ..   ...    ...
+    //
+    // The next set of loads correspond to a sliding window to the right.
+    // It loads inputs 4, 5, 14, 15, 23, 24 and keeps 2, 3, 12, 13, and 22:
+    //
+    //        INPUT                          OUTPUT
+    //   |\-------------------                |\-------------
+    //   | \                                  | \
+    //   |  \-------------------              |  \------------
+    //   |  | .. 2  3   4  5 ...              |  | .. 2  3 ...
+    //   |  | .. 12 13 14 15 ...     --->     |  | ..      ...
+    //   |  | .. 21 22 23 24 ...                 | ..      ...
+    //   |  | ..    ...      ...
+    //
+    // And so on...
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 1x2 outputs starting from the top left. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (0) and (1).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    // Slide to the right for outputs x = [2, 3], y = 0. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (2) and (3).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth, output_depth);
+
+    // Slide to the right again for outputs x = [4, 5], y = 0. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (4) and (5).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 6 * input_depth;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 4 * output_depth, output_depth);
+
+    // Slide to the right one last time for outputs x = [6, 7], y = 0.
+    // Referring to the indexes in the diagram above, this corresponds to
+    // outputs (6) and (7).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 8 * input_depth;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 6 * output_depth, output_depth);
+
+    // Slide to down for outputs x = [6, 7], y = 1. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (14) and (15).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 6 * output_depth + output_row_size,
+        output_depth);
+
+    // Slide left for outputs x = [4, 5], y = 1. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (12) and (13).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 4 * output_depth + output_row_size,
+        output_depth);
+
+    // Slide left again for outputs x = [2, 3], y = 1. Referring to the indexes
+    // in the diagram above, this corresponds to outputs (10) and (11).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth + output_row_size,
+        output_depth);
+
+    // Slide left one more time for outputs x = [0, 1], y = 1. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (8) and (9).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + output_row_size, output_depth);
+
+    // Slide down for outputs x = [0, 1], y = 2. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (16) and (17).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
+        input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_row_size, output_depth);
+
+    // Slide right for outputs x = [2, 3], y = 2. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (18) and (19).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 2 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0,
+        input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 2 * output_row_size, output_depth);
+
+    // Slide right for outputs x = [4, 5], y = 2. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (20) and (21).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 2 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
+        input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 4 * output_depth + 2 * output_row_size, output_depth);
+
+    // Slide right one more time for outputs x = [6, 7], y = 2. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (22) and (23).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 8 * input_depth + 2 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0,
+        input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 6 * output_depth + 2 * output_row_size, output_depth);
+
+    // Slide down for outputs x = [6, 7], y = 3. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (30) and (31).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 5 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 6 * output_depth + 3 * output_row_size, output_depth);
+
+    // Slide left for outputs x = [4, 5], y = 3. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (28) and (29).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 4 * output_depth + 3 * output_row_size, output_depth);
+
+    // Slide left for outputs x = [2, 3], y = 3. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (26) and (27).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 3 * output_row_size, output_depth);
+
+    // Slide left one more time for outputs x = [0, 1], y = 3. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (24) and (25).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 3 * output_row_size, output_depth);
+
+    // Slide down for outputs x = [0, 1], y = 4. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (32) and (33).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 6 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 4 * output_row_size, output_depth);
+
+    // Slide right for outputs x = [2, 3], y = 4. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (34) and (35).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 4 * output_row_size, output_depth);
+
+    // Slide right for outputs x = [4, 5], y = 4. Referring to the indexes in
+    // the diagram above, this corresponds to outputs (36) and (37).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 4 * output_depth + 4 * output_row_size, output_depth);
+
+    // Slide right one more time for outputs x = [6, 7], y = 4. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (38) and (39).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 8 * input_depth + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 6 * output_depth + 4 * output_row_size, output_depth);
+
+    // Slide down for outputs x = [6, 7], y = 5. Referring to the  indexes in
+    // the diagram above, this corresponds to outputs (46) and (47).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 7 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0,
+        input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 6 * output_depth + 5 * output_row_size, output_depth);
+
+    // Slide left for outputs x = [4, 5], y = 5. Referring to the  indexes in
+    // the diagram above, this corresponds to outputs (44) and (45).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 5 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
+        input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 4 * output_depth + 5 * output_row_size, output_depth);
+
+    // Slide left for outputs x = [2, 3], y = 5. Referring to the  indexes in
+    // the diagram above, this corresponds to outputs (42) and (43).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + 5 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0,
+        input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 5 * output_row_size, output_depth);
+
+    // Slide left one more time for outputs x = [0, 1], y = 5. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (40) and (41).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 5 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
+        input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 5 * output_row_size, output_depth);
+
+    // Slide down for outputs x = [0, 1], y = 6. Referring to the  indexes in
+    // the diagram above, this corresponds to outputs (48) and (49).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 8 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 6 * output_row_size, output_depth);
+
+    // Slide right for outputs x = [2, 3], y = 6. Referring to the  indexes in
+    // the diagram above, this corresponds to outputs (50) and (51).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 6 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 6 * output_row_size, output_depth);
+
+    // Slide right for outputs x = [4, 5], y = 6. Referring to the  indexes in
+    // the diagram above, this corresponds to outputs (52) and (53).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 6 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 4 * output_depth + 6 * output_row_size, output_depth);
+
+    // Slide right one more time for outputs x = [6, 7], y = 6. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (54) and (55).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 8 * input_depth + 6 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 6 * output_depth + 6 * output_row_size, output_depth);
+
+    // Slide down for outputs x = [6, 7], y = 7. Referring to the indexes in the
+    // diagram above, this corresponds to outputs (62) and (63).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 6 * input_depth + 9 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 6 * output_depth + 7 * output_row_size, output_depth);
+
+    // Slide left for outputs x = [4, 5], y = 7. Referring to the indexes in the
+    // diagram above, this corresponds to outputs (60) and (61).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 7 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 4 * output_depth + 7 * output_row_size, output_depth);
+
+    // Slide left for outputs x = [2, 3], y = 7. Referring to the indexes in the
+    // diagram above, this corresponds to outputs (58) and (59).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + 7 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 7 * output_row_size, output_depth);
+
+    // Slide left one more time for outputs x = [0, 1], y = 7. Referring to the
+    // indexes in the diagram above, this corresponds to outputs (56) and (57).
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 7 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 7 * output_row_size, output_depth);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<4, 4, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    const int output_row_size = output_depth * output_width;
+
+    // To process 4x4 outputs using a 3x3 filter, we require 6x6 inputs.
+    // Load inputs for the first 2 filters on the top left, then slide to
+    // the right, down, left, down, right, etc. in a snake-like path. This
+    // minimizes the total number of loads.
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 1x2 outputs starting from the top left.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    // Now load 1x2 inputs on the top right.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth, output_depth);
+
+    // Now load next inputs when sliding window down.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth + output_row_size,
+        output_depth);
+
+    // Now load next inputs when sliding window left.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + output_row_size, output_depth);
+
+    // Now load next inputs when sliding window down.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
+        input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_row_size, output_depth);
+
+    // Now load next inputs when sliding window right.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth + 2 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_10, input_11, input_8, input_9, input_2, input_3, input_0,
+        input_1, input_6, input_7, input_4, input_5, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 2 * output_row_size, output_depth);
+
+    // Now load next inputs when sliding window down.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + 5 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max,
+        output_ptr + 2 * output_depth + 3 * output_row_size, output_depth);
+
+    // Now load next inputs when sliding window left.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 3 * output_row_size, output_depth);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<4, 2, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    const int output_row_size = output_depth * output_width;
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 1x2 outputs starting from the top.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    output_ptr += output_row_size;
+
+    // Now load next inputs one row down.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    output_ptr += output_row_size;
+
+    // Now load next row.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_8, input_9, input_10, input_11, input_0, input_1, input_2,
+        input_3, input_4, input_5, input_6, input_7, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    output_ptr += output_row_size;
+
+    // Now load last row.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 5 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<4, 1, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    const int output_row_size = output_depth * output_width;
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 2x1 outputs starting from the top.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      ptr += input_row_size;
+      temp_3 = vld1_u8(ptr);
+      temp_4 = vld1_u8(ptr + input_depth);
+      temp_5 = vld1_u8(ptr + 2 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      ptr += input_row_size;
+      temp_3 = vld1_u8(ptr);
+      temp_4 = vld1_u8(ptr + input_depth);
+      temp_5 = vld1_u8(ptr + 2 * input_depth);
+
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2yStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_row_size);
+
+    // Load inputs for bottom 2 rows.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      ptr += input_row_size;
+      temp_3 = vld1_u8(ptr);
+      temp_4 = vld1_u8(ptr + input_depth);
+      temp_5 = vld1_u8(ptr + 2 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+    }
+
+    DotProductAndStore2yStride1(
+        filter, input_6, input_7, input_8, input_9, input_10, input_11, input_0,
+        input_1, input_2, input_3, input_4, input_5, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_row_size,
+        output_row_size);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<2, 2, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    Int32x8 acc_0, acc_1, acc_2, acc_3;
+
+    acc_0.low = vld1q_s32(bias_ptr);
+    acc_1.low = vld1q_s32(bias_ptr);
+    acc_2.low = vld1q_s32(bias_ptr);
+    acc_3.low = vld1q_s32(bias_ptr);
+
+    bias_ptr += 4;
+    acc_0.high = vld1q_s32(bias_ptr);
+    acc_1.high = vld1q_s32(bias_ptr);
+    acc_2.high = vld1q_s32(bias_ptr);
+    acc_3.high = vld1q_s32(bias_ptr);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+
+    // Add scope for input registers to help the compiler know that it is
+    // not needed.
+    {
+      // To process 2x2 outputs using a 3x3 filter, we require 4x4 inputs.
+      // Load inputs for the top two filters first.
+      int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+          input_7, input_8, input_9, input_10, input_11;
+
+      const uint8* ptr = input_ptr;
+
+      // Load top 3 rows.
+      {
+        uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+        input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+        input_0 = vaddq_s16(input_0, input_offset_vec);
+        input_1 = vaddq_s16(input_1, input_offset_vec);
+        input_2 = vaddq_s16(input_2, input_offset_vec);
+        input_3 = vaddq_s16(input_3, input_offset_vec);
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+        input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+        input_4 = vaddq_s16(input_4, input_offset_vec);
+        input_5 = vaddq_s16(input_5, input_offset_vec);
+        input_6 = vaddq_s16(input_6, input_offset_vec);
+        input_7 = vaddq_s16(input_7, input_offset_vec);
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+        input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+        input_8 = vaddq_s16(input_8, input_offset_vec);
+        input_9 = vaddq_s16(input_9, input_offset_vec);
+        input_10 = vaddq_s16(input_10, input_offset_vec);
+        input_11 = vaddq_s16(input_11, input_offset_vec);
+      }
+
+      // Multiply-accum for top-left output.
+      acc_0 = MultiplyAccumulate3x3Filter(filter, input_0, input_1, input_2,
+                                          input_4, input_5, input_6, input_8,
+                                          input_9, input_10, acc_0);
+
+      // Multiply-accum for top-right output.
+      acc_1 = MultiplyAccumulate3x3Filter(filter, input_1, input_2, input_3,
+                                          input_5, input_6, input_7, input_9,
+                                          input_10, input_11, acc_1);
+
+      // Now load the bottom row.
+      {
+        uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+        input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+        input_0 = vaddq_s16(input_0, input_offset_vec);
+        input_1 = vaddq_s16(input_1, input_offset_vec);
+        input_2 = vaddq_s16(input_2, input_offset_vec);
+        input_3 = vaddq_s16(input_3, input_offset_vec);
+      }
+
+      // Multiply-accum for bottom-left output.
+      acc_2 = MultiplyAccumulate3x3Filter(filter, input_4, input_5, input_6,
+                                          input_8, input_9, input_10, input_0,
+                                          input_1, input_2, acc_2);
+
+      // Multiply-accum for bottom-right output.
+      acc_3 = MultiplyAccumulate3x3Filter(filter, input_5, input_6, input_7,
+                                          input_9, input_10, input_11, input_1,
+                                          input_2, input_3, acc_3);
+    }
+
+    DownquantizeAndStore2x2Output(acc_0, acc_1, acc_2, acc_3, output_offset,
+                                  output_multiplier, output_shift,
+                                  output_activation_min, output_activation_max,
+                                  output_ptr, output_depth, output_width);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<2, 4, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    const int output_row_size = output_depth * output_width;
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 1x2 outputs starting from the top left.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    // Now load 1x2 inputs on the top right.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + 4 * input_depth;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth, output_depth);
+
+    // Now load next inputs when sliding window down.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr + 2 * input_depth + 3 * input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_6, input_7, input_4, input_5, input_10, input_11, input_8,
+        input_9, input_2, input_3, input_0, input_1, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth + output_row_size,
+        output_depth);
+
+    // Now load next inputs when sliding window left.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_4, input_5, input_6, input_7, input_8, input_9, input_10,
+        input_11, input_0, input_1, input_2, input_3, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + output_row_size, output_depth);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<1, 4, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 1x2 outputs starting from the left.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      temp_3 = vld1_u8(ptr + 3 * input_depth);
+
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth);
+
+    // Now load 1x2 inputs on the right.
+    {
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr + input_depth * 4;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_2 = vld1_u8(ptr);
+      temp_3 = vld1_u8(ptr + input_depth);
+
+      ptr += input_row_size;
+      temp_4 = vld1_u8(ptr);
+      temp_5 = vld1_u8(ptr + input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+    }
+
+    DotProductAndStore2xStride1(
+        filter, input_2, input_3, input_0, input_1, input_6, input_7, input_4,
+        input_5, input_10, input_11, input_8, input_9, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr + 2 * output_depth, output_depth);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<2, 1, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    // To process 2x1 outputs using a 3x3 filter, we require 4x3 inputs.
+    // Load all inputs at the beginning.
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11;
+
+    // Load inputs for 1x2 outputs starting from the top left.
+    {
+      const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+      uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5;
+
+      const uint8* ptr = input_ptr;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      ptr += input_row_size;
+      temp_3 = vld1_u8(ptr);
+      temp_4 = vld1_u8(ptr + input_depth);
+      temp_5 = vld1_u8(ptr + 2 * input_depth);
+
+      input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_0 = vaddq_s16(input_0, input_offset_vec);
+      input_1 = vaddq_s16(input_1, input_offset_vec);
+      input_2 = vaddq_s16(input_2, input_offset_vec);
+      input_3 = vaddq_s16(input_3, input_offset_vec);
+      input_4 = vaddq_s16(input_4, input_offset_vec);
+      input_5 = vaddq_s16(input_5, input_offset_vec);
+
+      ptr += input_row_size;
+      temp_0 = vld1_u8(ptr);
+      temp_1 = vld1_u8(ptr + input_depth);
+      temp_2 = vld1_u8(ptr + 2 * input_depth);
+      ptr += input_row_size;
+      temp_3 = vld1_u8(ptr);
+      temp_4 = vld1_u8(ptr + input_depth);
+      temp_5 = vld1_u8(ptr + 2 * input_depth);
+
+      input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+      input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+      input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+      input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+      input_10 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+      input_11 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+      input_6 = vaddq_s16(input_6, input_offset_vec);
+      input_7 = vaddq_s16(input_7, input_offset_vec);
+      input_8 = vaddq_s16(input_8, input_offset_vec);
+      input_9 = vaddq_s16(input_9, input_offset_vec);
+      input_10 = vaddq_s16(input_10, input_offset_vec);
+      input_11 = vaddq_s16(input_11, input_offset_vec);
+    }
+
+    DotProductAndStore2yStride1(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9, input_10, input_11, bias_ptr, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_ptr, output_depth * output_width);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<4, 2, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    const int output_row_size = output_depth * output_width;
+
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    Int32x8 acc_0, acc_1;
+    acc_0.low = vld1q_s32(bias_ptr);
+    acc_1.low = vld1q_s32(bias_ptr);
+    acc_0.high = vld1q_s32(bias_ptr + 4);
+    acc_1.high = vld1q_s32(bias_ptr + 4);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, input_9;
+
+    const uint8* ptr = input_ptr;
+    uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4;
+
+    // Load first 2 rows.
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+    input_9 = vaddq_s16(input_9, input_offset_vec);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2,
+                                  input_2, input_3, input_4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5,
+                                  input_5, input_6, input_7);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5,
+                                  input_7, input_8, input_9);
+
+    // Load next 2 rows.
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+    input_9 = vaddq_s16(input_9, input_offset_vec);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8,
+                                  input_2, input_3, input_4);
+
+    DownquantizeAndStore2Output(
+        acc_0, acc_1, output_offset, output_multiplier, output_shift,
+        output_activation_min, output_activation_max, output_ptr, output_depth);
+
+    output_ptr += output_row_size;
+
+    // Moving onto the next row of outputs.
+    acc_0.low = vld1q_s32(bias_ptr);
+    acc_1.low = vld1q_s32(bias_ptr);
+    acc_0.high = vld1q_s32(bias_ptr + 4);
+    acc_1.high = vld1q_s32(bias_ptr + 4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2,
+                                  input_2, input_3, input_4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5,
+                                  input_5, input_6, input_7);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5,
+                                  input_7, input_8, input_9);
+
+    // Load next 2 rows.
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+    input_9 = vaddq_s16(input_9, input_offset_vec);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8,
+                                  input_2, input_3, input_4);
+
+    DownquantizeAndStore2Output(
+        acc_0, acc_1, output_offset, output_multiplier, output_shift,
+        output_activation_min, output_activation_max, output_ptr, output_depth);
+
+    output_ptr += output_row_size;
+
+    // Moving onto the next row of outputs.
+    acc_0.low = vld1q_s32(bias_ptr);
+    acc_1.low = vld1q_s32(bias_ptr);
+    acc_0.high = vld1q_s32(bias_ptr + 4);
+    acc_1.high = vld1q_s32(bias_ptr + 4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2,
+                                  input_2, input_3, input_4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5,
+                                  input_5, input_6, input_7);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5,
+                                  input_7, input_8, input_9);
+
+    // Load next 2 rows.
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+    input_9 = vaddq_s16(input_9, input_offset_vec);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8,
+                                  input_2, input_3, input_4);
+
+    DownquantizeAndStore2Output(
+        acc_0, acc_1, output_offset, output_multiplier, output_shift,
+        output_activation_min, output_activation_max, output_ptr, output_depth);
+
+    output_ptr += output_row_size;
+
+    // Moving onto the next row of outputs.
+    acc_0.low = vld1q_s32(bias_ptr);
+    acc_1.low = vld1q_s32(bias_ptr);
+    acc_0.high = vld1q_s32(bias_ptr + 4);
+    acc_1.high = vld1q_s32(bias_ptr + 4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2,
+                                  input_2, input_3, input_4);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5,
+                                  input_5, input_6, input_7);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5,
+                                  input_7, input_8, input_9);
+
+    // Load last row.
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    temp_3 = vld1_u8(ptr + 3 * input_depth);
+    temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+
+    acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8,
+                                  input_0, input_1, input_2);
+
+    acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8,
+                                  input_2, input_3, input_4);
+
+    DownquantizeAndStore2Output(
+        acc_0, acc_1, output_offset, output_multiplier, output_shift,
+        output_activation_min, output_activation_max, output_ptr, output_depth);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<4, 4, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    // Reuse 4x2 kernel twice.
+    ConvKernel3x3FilterDepth8<4, 2, 2>::Run(
+        input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+        filter_offset, bias_ptr, output_offset, output_multiplier, output_shift,
+        output_activation_min, output_activation_max, output_ptr, output_depth,
+        output_width);
+
+    ConvKernel3x3FilterDepth8<4, 2, 2>::Run(
+        input_ptr + 4 * input_depth, input_depth, input_offset, input_row_size,
+        filter_ptr, filter_offset, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max,
+        output_ptr + 2 * output_depth, output_depth, output_width);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<4, 1, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    const int output_row_size = output_depth * output_width;
+
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8;
+    uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7,
+        temp_8;
+
+    const uint8* ptr = input_ptr;
+
+    // Load all inputs for top output.
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+    temp_8 = vld1_u8(ptr + 2 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Second output.
+    output_ptr += output_row_size;
+
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_6, input_7, input_8, input_0, input_1, input_2, input_3,
+        input_4, input_5, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Third output.
+    output_ptr += output_row_size;
+
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+    temp_8 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_3, input_4, input_5, input_6, input_7, input_8, input_0,
+        input_1, input_2, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Fourth output.
+    output_ptr += output_row_size;
+
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+    temp_8 = vld1_u8(ptr + 2 * input_depth);
+
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<2, 2, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    Int32x8 acc_0, acc_1, acc_2, acc_3;
+    acc_0.low = vld1q_s32(bias_ptr);
+    acc_1.low = vld1q_s32(bias_ptr);
+    acc_2.low = vld1q_s32(bias_ptr);
+    acc_3.low = vld1q_s32(bias_ptr);
+
+    bias_ptr += 4;
+    acc_0.high = vld1q_s32(bias_ptr);
+    acc_1.high = vld1q_s32(bias_ptr);
+    acc_2.high = vld1q_s32(bias_ptr);
+    acc_3.high = vld1q_s32(bias_ptr);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+
+    // Add scope for input registers to help the compiler know that it is
+    // not needed.
+    {
+      // To process 2x2 outputs using a 3x3 filter at stride 2, we require
+      // 5x5 inputs. We load the first 5x2 inputs at a time.
+      int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+          input_7, input_8, input_9;
+
+      const uint8* ptr = input_ptr;
+
+      // Load inputs.
+      {
+        uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4;
+
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+        temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+        input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+        input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+        input_0 = vaddq_s16(input_0, input_offset_vec);
+        input_1 = vaddq_s16(input_1, input_offset_vec);
+        input_2 = vaddq_s16(input_2, input_offset_vec);
+        input_3 = vaddq_s16(input_3, input_offset_vec);
+        input_4 = vaddq_s16(input_4, input_offset_vec);
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+        temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+        input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+        input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+        input_5 = vaddq_s16(input_5, input_offset_vec);
+        input_6 = vaddq_s16(input_6, input_offset_vec);
+        input_7 = vaddq_s16(input_7, input_offset_vec);
+        input_8 = vaddq_s16(input_8, input_offset_vec);
+        input_9 = vaddq_s16(input_9, input_offset_vec);
+      }
+
+      acc_0 = MultiplyAccumulateRow(acc_0, filter.f0, filter.f1, filter.f2,
+                                    input_0, input_1, input_2);
+
+      acc_1 = MultiplyAccumulateRow(acc_1, filter.f0, filter.f1, filter.f2,
+                                    input_2, input_3, input_4);
+
+      acc_0 = MultiplyAccumulateRow(acc_0, filter.f3, filter.f4, filter.f5,
+                                    input_5, input_6, input_7);
+
+      acc_1 = MultiplyAccumulateRow(acc_1, filter.f3, filter.f4, filter.f5,
+                                    input_7, input_8, input_9);
+
+      // Load next inputs.
+      {
+        uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4;
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+        temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+        input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+        input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+        input_0 = vaddq_s16(input_0, input_offset_vec);
+        input_1 = vaddq_s16(input_1, input_offset_vec);
+        input_2 = vaddq_s16(input_2, input_offset_vec);
+        input_3 = vaddq_s16(input_3, input_offset_vec);
+        input_4 = vaddq_s16(input_4, input_offset_vec);
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+        temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+        input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+        input_9 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+        input_5 = vaddq_s16(input_5, input_offset_vec);
+        input_6 = vaddq_s16(input_6, input_offset_vec);
+        input_7 = vaddq_s16(input_7, input_offset_vec);
+        input_8 = vaddq_s16(input_8, input_offset_vec);
+        input_9 = vaddq_s16(input_9, input_offset_vec);
+      }
+
+      acc_0 = MultiplyAccumulateRow(acc_0, filter.f6, filter.f7, filter.f8,
+                                    input_0, input_1, input_2);
+
+      acc_1 = MultiplyAccumulateRow(acc_1, filter.f6, filter.f7, filter.f8,
+                                    input_2, input_3, input_4);
+
+      // Moving onto the two bottom outputs.
+      acc_2 = MultiplyAccumulateRow(acc_2, filter.f0, filter.f1, filter.f2,
+                                    input_0, input_1, input_2);
+
+      acc_3 = MultiplyAccumulateRow(acc_3, filter.f0, filter.f1, filter.f2,
+                                    input_2, input_3, input_4);
+
+      acc_2 = MultiplyAccumulateRow(acc_2, filter.f3, filter.f4, filter.f5,
+                                    input_5, input_6, input_7);
+
+      acc_3 = MultiplyAccumulateRow(acc_3, filter.f3, filter.f4, filter.f5,
+                                    input_7, input_8, input_9);
+
+      // Load last input row.
+      {
+        uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4;
+
+        ptr += input_row_size;
+        temp_0 = vld1_u8(ptr);
+        temp_1 = vld1_u8(ptr + input_depth);
+        temp_2 = vld1_u8(ptr + 2 * input_depth);
+        temp_3 = vld1_u8(ptr + 3 * input_depth);
+        temp_4 = vld1_u8(ptr + 4 * input_depth);
+
+        input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+        input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+        input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+        input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+        input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+
+        input_0 = vaddq_s16(input_0, input_offset_vec);
+        input_1 = vaddq_s16(input_1, input_offset_vec);
+        input_2 = vaddq_s16(input_2, input_offset_vec);
+        input_3 = vaddq_s16(input_3, input_offset_vec);
+        input_4 = vaddq_s16(input_4, input_offset_vec);
+      }
+
+      acc_2 = MultiplyAccumulateRow(acc_2, filter.f6, filter.f7, filter.f8,
+                                    input_0, input_1, input_2);
+
+      acc_3 = MultiplyAccumulateRow(acc_3, filter.f6, filter.f7, filter.f8,
+                                    input_2, input_3, input_4);
+    }
+
+    DownquantizeAndStore2x2Output(acc_0, acc_1, acc_2, acc_3, output_offset,
+                                  output_multiplier, output_shift,
+                                  output_activation_min, output_activation_max,
+                                  output_ptr, output_depth, output_width);
+  }
  };
  
-struct Filter3x3x16 {
-  Int16x16x3 r0, r1, r2;
+template <>
+struct ConvKernel3x3FilterDepth8<2, 4, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    // Reuse 2x2 kernel twice.
+    ConvKernel3x3FilterDepth8<2, 2, 2>::Run(
+        input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+        filter_offset, bias_ptr, output_offset, output_multiplier, output_shift,
+        output_activation_min, output_activation_max, output_ptr, output_depth,
+        output_width);
+
+    ConvKernel3x3FilterDepth8<2, 2, 2>::Run(
+        input_ptr + 4 * input_depth, input_depth, input_offset, input_row_size,
+        filter_ptr, filter_offset, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max,
+        output_ptr + 2 * output_depth, output_depth, output_width);
+  }
  };
  
-// Loads 3x3 filter of depth 16 and adds filter offsets.
-inline Filter3x3x16 LoadFilterDepth16(const uint8* filter_ptr,
-                                      int32 filter_offset, int output_depth) {
-  Filter3x3x16 filter;
+template <>
+struct ConvKernel3x3FilterDepth8<2, 1, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    const int output_row_size = output_depth * output_width;
  
-  uint8x8_t temp_u8_0, temp_u8_1, temp_u8_2, temp_u8_3, temp_u8_4, temp_u8_5,
-      temp_u8_6, temp_u8_7, temp_u8_8, temp_u8_9, temp_u8_10, temp_u8_11,
-      temp_u8_12, temp_u8_13, temp_u8_14, temp_u8_15, temp_u8_16, temp_u8_17;
-  int16x8_t filter_offset_vec = vdupq_n_s16(filter_offset);
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
  
-  temp_u8_0 = vld1_u8(filter_ptr + 0 * output_depth);
-  temp_u8_1 = vld1_u8(filter_ptr + 0 * output_depth + 8);
-  temp_u8_2 = vld1_u8(filter_ptr + 1 * output_depth);
-  temp_u8_3 = vld1_u8(filter_ptr + 1 * output_depth + 8);
-  temp_u8_4 = vld1_u8(filter_ptr + 2 * output_depth);
-  temp_u8_5 = vld1_u8(filter_ptr + 2 * output_depth + 8);
-
-  temp_u8_6 = vld1_u8(filter_ptr + 3 * output_depth);
-  temp_u8_7 = vld1_u8(filter_ptr + 3 * output_depth + 8);
-  temp_u8_8 = vld1_u8(filter_ptr + 4 * output_depth);
-  temp_u8_9 = vld1_u8(filter_ptr + 4 * output_depth + 8);
-  temp_u8_10 = vld1_u8(filter_ptr + 5 * output_depth);
-  temp_u8_11 = vld1_u8(filter_ptr + 5 * output_depth + 8);
-
-  temp_u8_12 = vld1_u8(filter_ptr + 6 * output_depth);
-  temp_u8_13 = vld1_u8(filter_ptr + 6 * output_depth + 8);
-  temp_u8_14 = vld1_u8(filter_ptr + 7 * output_depth);
-  temp_u8_15 = vld1_u8(filter_ptr + 7 * output_depth + 8);
-  temp_u8_16 = vld1_u8(filter_ptr + 8 * output_depth);
-  temp_u8_17 = vld1_u8(filter_ptr + 8 * output_depth + 8);
-
-  filter.r0.v0.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_0));
-  filter.r0.v0.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_1));
-  filter.r0.v1.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_2));
-  filter.r0.v1.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_3));
-  filter.r0.v2.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_4));
-  filter.r0.v2.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_5));
-
-  filter.r1.v0.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_6));
-  filter.r1.v0.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_7));
-  filter.r1.v1.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_8));
-  filter.r1.v1.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_9));
-  filter.r1.v2.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_10));
-  filter.r1.v2.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_11));
-
-  filter.r2.v0.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_12));
-  filter.r2.v0.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_13));
-  filter.r2.v1.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_14));
-  filter.r2.v1.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_15));
-  filter.r2.v2.low = vreinterpretq_s16_u16(vmovl_u8(temp_u8_16));
-  filter.r2.v2.high = vreinterpretq_s16_u16(vmovl_u8(temp_u8_17));
-
-  filter.r0.v0.low = vaddq_s16(filter.r0.v0.low, filter_offset_vec);
-  filter.r0.v0.high = vaddq_s16(filter.r0.v0.high, filter_offset_vec);
-  filter.r0.v1.low = vaddq_s16(filter.r0.v1.low, filter_offset_vec);
-  filter.r0.v1.high = vaddq_s16(filter.r0.v1.high, filter_offset_vec);
-  filter.r0.v2.low = vaddq_s16(filter.r0.v2.low, filter_offset_vec);
-  filter.r0.v2.high = vaddq_s16(filter.r0.v2.high, filter_offset_vec);
-
-  filter.r1.v0.low = vaddq_s16(filter.r1.v0.low, filter_offset_vec);
-  filter.r1.v0.high = vaddq_s16(filter.r1.v0.high, filter_offset_vec);
-  filter.r1.v1.low = vaddq_s16(filter.r1.v1.low, filter_offset_vec);
-  filter.r1.v1.high = vaddq_s16(filter.r1.v1.high, filter_offset_vec);
-  filter.r1.v2.low = vaddq_s16(filter.r1.v2.low, filter_offset_vec);
-  filter.r1.v2.high = vaddq_s16(filter.r1.v2.high, filter_offset_vec);
-
-  filter.r2.v0.low = vaddq_s16(filter.r2.v0.low, filter_offset_vec);
-  filter.r2.v0.high = vaddq_s16(filter.r2.v0.high, filter_offset_vec);
-  filter.r2.v1.low = vaddq_s16(filter.r2.v1.low, filter_offset_vec);
-  filter.r2.v1.high = vaddq_s16(filter.r2.v1.high, filter_offset_vec);
-  filter.r2.v2.low = vaddq_s16(filter.r2.v2.low, filter_offset_vec);
-  filter.r2.v2.high = vaddq_s16(filter.r2.v2.high, filter_offset_vec);
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8;
+    uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7,
+        temp_8;
  
-  return filter;
-}
+    const uint8* ptr = input_ptr;
  
-// Loads 3 input cells of depth 16 and adds input offsets.
-inline Int16x16x3 LoadInputRowDepth16(const uint8* ptr, int input_depth,
-                                      int32 input_offset,
-                                      Int16x16x3 input_row) {
-  uint8x8_t temp_0, temp_1;
-  int16x8_t offset_vec = vdupq_n_s16(input_offset);
-
-  temp_0 = vld1_u8(ptr + 0 * input_depth);
-  temp_1 = vld1_u8(ptr + 0 * input_depth + 8);
-  input_row.v0.low = vreinterpretq_s16_u16(vmovl_u8(temp_0));
-  input_row.v0.high = vreinterpretq_s16_u16(vmovl_u8(temp_1));
-  input_row.v0.low = vaddq_s16(input_row.v0.low, offset_vec);
-  input_row.v0.high = vaddq_s16(input_row.v0.high, offset_vec);
-
-  temp_0 = vld1_u8(ptr + 1 * input_depth);
-  temp_1 = vld1_u8(ptr + 1 * input_depth + 8);
-  input_row.v1.low = vreinterpretq_s16_u16(vmovl_u8(temp_0));
-  input_row.v1.high = vreinterpretq_s16_u16(vmovl_u8(temp_1));
-  input_row.v1.low = vaddq_s16(input_row.v1.low, offset_vec);
-  input_row.v1.high = vaddq_s16(input_row.v1.high, offset_vec);
-
-  temp_0 = vld1_u8(ptr + 2 * input_depth);
-  temp_1 = vld1_u8(ptr + 2 * input_depth + 8);
-  input_row.v2.low = vreinterpretq_s16_u16(vmovl_u8(temp_0));
-  input_row.v2.high = vreinterpretq_s16_u16(vmovl_u8(temp_1));
-  input_row.v2.low = vaddq_s16(input_row.v2.low, offset_vec);
-  input_row.v2.high = vaddq_s16(input_row.v2.high, offset_vec);
-
-  return input_row;
-}
+    // Load all inputs for top output.
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+    temp_8 = vld1_u8(ptr + 2 * input_depth);
  
-// Performs multiply accumulate on 3 inputs of depth 16.
-inline Int32x16 MultiplyAccumulateRowDepth16(Int32x16 output,
-                                             const Int16x16x3& filter_row,
-                                             const Int16x16x3& input_row) {
-  output.v0 = vmlal_s16(output.v0, vget_low_s16(filter_row.v0.low),
-                        vget_low_s16(input_row.v0.low));
-  output.v1 = vmlal_s16(output.v1, vget_high_s16(filter_row.v0.low),
-                        vget_high_s16(input_row.v0.low));
-  output.v2 = vmlal_s16(output.v2, vget_low_s16(filter_row.v0.high),
-                        vget_low_s16(input_row.v0.high));
-  output.v3 = vmlal_s16(output.v3, vget_high_s16(filter_row.v0.high),
-                        vget_high_s16(input_row.v0.high));
-
-  output.v0 = vmlal_s16(output.v0, vget_low_s16(filter_row.v1.low),
-                        vget_low_s16(input_row.v1.low));
-  output.v1 = vmlal_s16(output.v1, vget_high_s16(filter_row.v1.low),
-                        vget_high_s16(input_row.v1.low));
-  output.v2 = vmlal_s16(output.v2, vget_low_s16(filter_row.v1.high),
-                        vget_low_s16(input_row.v1.high));
-  output.v3 = vmlal_s16(output.v3, vget_high_s16(filter_row.v1.high),
-                        vget_high_s16(input_row.v1.high));
-
-  output.v0 = vmlal_s16(output.v0, vget_low_s16(filter_row.v2.low),
-                        vget_low_s16(input_row.v2.low));
-  output.v1 = vmlal_s16(output.v1, vget_high_s16(filter_row.v2.low),
-                        vget_high_s16(input_row.v2.low));
-  output.v2 = vmlal_s16(output.v2, vget_low_s16(filter_row.v2.high),
-                        vget_low_s16(input_row.v2.high));
-  output.v3 = vmlal_s16(output.v3, vget_high_s16(filter_row.v2.high),
-                        vget_high_s16(input_row.v2.high));
-
-  return output;
-}
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
  
-// Applies activation, offset and downquantize on a set of accumulator
-// registers of depth 16. Stores results to output.
-inline void DownquantizeAndStoreDepth16(Int32x16 acc, int32 output_multiplier,
-                                        int output_shift,
-                                        int32x4_t output_offset_vec,
-                                        int32x4_t output_activation_min_vec,
-                                        int32x4_t output_activation_max_vec,
-                                        uint8* output_ptr) {
-  // Fixed-point multiplication.
-  acc.v0 = vqrdmulhq_n_s32(acc.v0, output_multiplier);
-  acc.v1 = vqrdmulhq_n_s32(acc.v1, output_multiplier);
-  acc.v2 = vqrdmulhq_n_s32(acc.v2, output_multiplier);
-  acc.v3 = vqrdmulhq_n_s32(acc.v3, output_multiplier);
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
  
-  using gemmlowp::RoundingDivideByPOT;
-  acc.v0 = RoundingDivideByPOT(acc.v0, output_shift);
-  acc.v1 = RoundingDivideByPOT(acc.v1, output_shift);
-  acc.v2 = RoundingDivideByPOT(acc.v2, output_shift);
-  acc.v3 = RoundingDivideByPOT(acc.v3, output_shift);
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
  
-  // Add the output offset.
-  acc.v0 = vaddq_s32(acc.v0, output_offset_vec);
-  acc.v1 = vaddq_s32(acc.v1, output_offset_vec);
-  acc.v2 = vaddq_s32(acc.v2, output_offset_vec);
-  acc.v3 = vaddq_s32(acc.v3, output_offset_vec);
+    // Second output.
+    output_ptr += output_row_size;
  
-  // Apply the activation function.
-  acc.v0 = vmaxq_s32(acc.v0, output_activation_min_vec);
-  acc.v1 = vmaxq_s32(acc.v1, output_activation_min_vec);
-  acc.v2 = vmaxq_s32(acc.v2, output_activation_min_vec);
-  acc.v3 = vmaxq_s32(acc.v3, output_activation_min_vec);
+    ptr += input_row_size;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
  
-  acc.v0 = vminq_s32(acc.v0, output_activation_max_vec);
-  acc.v1 = vminq_s32(acc.v1, output_activation_max_vec);
-  acc.v2 = vminq_s32(acc.v2, output_activation_max_vec);
-  acc.v3 = vminq_s32(acc.v3, output_activation_max_vec);
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
  
-  // Saturating cast to uint8 and store to destination.
-  int16x4_t acc_tlla_s16 = vqmovn_s32(acc.v0);
-  int16x4_t acc_tllb_s16 = vqmovn_s32(acc.v1);
-  int16x4_t acc_tlha_s16 = vqmovn_s32(acc.v2);
-  int16x4_t acc_tlhb_s16 = vqmovn_s32(acc.v3);
-
-  int16x8_t res_s16_0 = vcombine_s16(acc_tlla_s16, acc_tllb_s16);
-  int16x8_t res_s16_1 = vcombine_s16(acc_tlha_s16, acc_tlhb_s16);
-  uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0);
-  uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1);
-  vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1));
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_6, input_7, input_8, input_0, input_1, input_2, input_3,
+        input_4, input_5, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<1, 2, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8;
+    uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7,
+        temp_8;
+
+    const uint8* ptr = input_ptr;
+
+    // Load all inputs for top output.
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+    temp_8 = vld1_u8(ptr + 2 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Second output.
+    output_ptr += output_depth;
+
+    ptr = input_ptr + 3 * input_depth;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_2, input_0, input_1, input_5, input_3, input_4, input_8,
+        input_6, input_7, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<1, 4, 2> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8;
+    uint8x8_t temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7,
+        temp_8;
+
+    const uint8* ptr = input_ptr;
+
+    // Load all inputs for top output.
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    temp_2 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    temp_5 = vld1_u8(ptr + 2 * input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+    temp_8 = vld1_u8(ptr + 2 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Second output.
+    output_ptr += output_depth;
+
+    ptr = input_ptr + 3 * input_depth;
+    temp_0 = vld1_u8(ptr);
+    temp_1 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_3 = vld1_u8(ptr);
+    temp_4 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_6 = vld1_u8(ptr);
+    temp_7 = vld1_u8(ptr + input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_2, input_0, input_1, input_5, input_3, input_4, input_8,
+        input_6, input_7, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Third output.
+    output_ptr += output_depth;
+
+    ptr = input_ptr + 5 * input_depth;
+    temp_2 = vld1_u8(ptr);
+    temp_0 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_5 = vld1_u8(ptr);
+    temp_3 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_8 = vld1_u8(ptr);
+    temp_6 = vld1_u8(ptr + input_depth);
+
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_1, input_2, input_0, input_4, input_5, input_3, input_7,
+        input_8, input_6, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+
+    // Fourth output.
+    output_ptr += output_depth;
+
+    ptr = input_ptr + 7 * input_depth;
+    temp_1 = vld1_u8(ptr);
+    temp_2 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_4 = vld1_u8(ptr);
+    temp_5 = vld1_u8(ptr + input_depth);
+    ptr += input_row_size;
+    temp_7 = vld1_u8(ptr);
+    temp_8 = vld1_u8(ptr + input_depth);
+
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+  }
+};
+
+template <>
+struct ConvKernel3x3FilterDepth8<1, 1> {
+  static inline void Run(const uint8* input_ptr, int input_depth,
+                         int32 input_offset, int input_row_size,
+                         const uint8* filter_ptr, int32 filter_offset,
+                         const int32* bias_ptr, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_ptr,
+                         int output_depth, int output_width) {
+    Filter3x3x8 filter = Load3x3Filter(filter_ptr, filter_offset, output_depth);
+
+    int16x8_t input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8;
+
+    uint8x8_t temp_0 = vld1_u8(input_ptr);
+    uint8x8_t temp_1 = vld1_u8(input_ptr + input_depth);
+    uint8x8_t temp_2 = vld1_u8(input_ptr + 2 * input_depth);
+
+    input_ptr += input_row_size;
+    uint8x8_t temp_3 = vld1_u8(input_ptr);
+    uint8x8_t temp_4 = vld1_u8(input_ptr + input_depth);
+    uint8x8_t temp_5 = vld1_u8(input_ptr + 2 * input_depth);
+
+    input_ptr += input_row_size;
+    uint8x8_t temp_6 = vld1_u8(input_ptr);
+    uint8x8_t temp_7 = vld1_u8(input_ptr + input_depth);
+    uint8x8_t temp_8 = vld1_u8(input_ptr + 2 * input_depth);
+
+    input_0 = vreinterpretq_s16_u16(vmovl_u8(temp_0));
+    input_1 = vreinterpretq_s16_u16(vmovl_u8(temp_1));
+    input_2 = vreinterpretq_s16_u16(vmovl_u8(temp_2));
+    input_3 = vreinterpretq_s16_u16(vmovl_u8(temp_3));
+    input_4 = vreinterpretq_s16_u16(vmovl_u8(temp_4));
+    input_5 = vreinterpretq_s16_u16(vmovl_u8(temp_5));
+    input_6 = vreinterpretq_s16_u16(vmovl_u8(temp_6));
+    input_7 = vreinterpretq_s16_u16(vmovl_u8(temp_7));
+    input_8 = vreinterpretq_s16_u16(vmovl_u8(temp_8));
+
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    input_0 = vaddq_s16(input_0, input_offset_vec);
+    input_1 = vaddq_s16(input_1, input_offset_vec);
+    input_2 = vaddq_s16(input_2, input_offset_vec);
+    input_3 = vaddq_s16(input_3, input_offset_vec);
+    input_4 = vaddq_s16(input_4, input_offset_vec);
+    input_5 = vaddq_s16(input_5, input_offset_vec);
+    input_6 = vaddq_s16(input_6, input_offset_vec);
+    input_7 = vaddq_s16(input_7, input_offset_vec);
+    input_8 = vaddq_s16(input_8, input_offset_vec);
+
+    DotProductAndStore(
+        filter, input_0, input_1, input_2, input_3, input_4, input_5, input_6,
+        input_7, input_8, bias_ptr, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_ptr);
+  }
+};
+
+inline void ShuffleInput(const uint8* input_ptr, int input_depth,
+                         int input_width, int input_height, int output_depth,
+                         int output_width, int output_height,
+                         uint8* output_ptr) {
+  const int input_row_size = input_depth * input_width;
+
+  for (int y = 0; y < output_height; y++) {
+    const uint8* ptr = input_ptr;
+    for (int x = 0; x < output_width; x++) {
+      memcpy(output_ptr, ptr, output_depth);
+      output_ptr += output_depth;
+      ptr += input_depth;
+    }
+    input_ptr += input_row_size;
+  }
  }
  
-// A kernel that is optimized on the number of output cells in the x and y
-// direction, and the stride. Assumes 3x3 filters of 16 depth.
-template <int kFixedOutputX, int kFixedOutputY, int kFixedStride = 1>
-struct ConvKernel3x3FilterDepth16 {};
+template <int kFixedHeight, int kFixedStrideWidth,
+          int kFixedStrideHeight = kFixedStrideWidth>
+struct ConvRow3x3FilterDepth8 {};
+
+template <int kFixedStrideWidth>
+struct ConvRow3x3FilterDepth8<1, kFixedStrideWidth> {
+  static inline void Run(const uint8* input_data, int start_x, int start_y,
+                         int input_depth, int input_width, int input_height,
+                         int input_row_size, int32 input_offset,
+                         const uint8* filter_data, int32 filter_offset,
+                         const int32* bias_data, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         int output_depth, int output_width,
+                         uint8* shuffle_workspace) {
+    int out_x = start_x;
+
+    // 1x4 at a time.
+    for (; out_x <= output_width - 4; out_x += 4) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<1, 4, kFixedStrideWidth>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 4 * kFixedStrideWidth * input_depth;
+      output_data += 4 * output_depth;
+    }
+
+    // 1x1 at a time.
+    for (; out_x < output_width; out_x++) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<1, 1>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += kFixedStrideWidth * input_depth;
+      output_data += output_depth;
+    }
+  }
+};
+
+template <int kFixedStrideWidth>
+struct ConvRow3x3FilterDepth8<2, kFixedStrideWidth> {
+  static inline void Run(const uint8* input_data, int start_x, int start_y,
+                         int input_depth, int input_width, int input_height,
+                         int input_row_size, int32 input_offset,
+                         const uint8* filter_data, int32 filter_offset,
+                         const int32* bias_data, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         int output_depth, int output_width,
+                         uint8* shuffle_workspace) {
+    int out_x = start_x;
+
+    // 2x4 at a time.
+    for (; out_x <= output_width - 4; out_x += 4) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<2, 4, kFixedStrideWidth>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 4 * kFixedStrideWidth * input_depth;
+      output_data += 4 * output_depth;
+    }
+
+    // 2x2 at a time.
+    for (; out_x <= output_width - 2; out_x += 2) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<2, 2, kFixedStrideWidth>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 2 * kFixedStrideWidth * input_depth;
+      output_data += 2 * output_depth;
+    }
+
+    // 2x1 at a time.
+    for (; out_x < output_width; out_x++) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<2, 1, kFixedStrideWidth>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += kFixedStrideWidth * input_depth;
+      output_data += output_depth;
+    }
+  }
+};
+
+template <>
+struct ConvRow3x3FilterDepth8<4, 1> {
+  static inline void Run(const uint8* input_data, int start_x, int start_y,
+                         int input_depth, int input_width, int input_height,
+                         int input_row_size, int32 input_offset,
+                         const uint8* filter_data, int32 filter_offset,
+                         const int32* bias_data, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         int output_depth, int output_width,
+                         uint8* shuffle_workspace) {
+    int out_x = start_x;
+
+    // 4x4 at a time.
+    for (; out_x <= output_width - 4; out_x += 4) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<4, 4, 1>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 4 * input_depth;
+      output_data += 4 * output_depth;
+    }
+
+    // Handle the rest of the right side.
+    // 4x2 at a time.
+    for (; out_x <= output_width - 2; out_x += 2) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<4, 2, 1>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 2 * input_depth;
+      output_data += 2 * output_depth;
+    }
+
+    // 4x1 at a time.
+    for (; out_x < output_width; out_x++) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<4, 1, 1>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += input_depth;
+      output_data += output_depth;
+    }
+  }
+};
  
  template <>
-struct ConvKernel3x3FilterDepth16<1, 2, 1> {
-  static void Run(const Filter3x3x16& filter, const uint8* input_ptr,
-                  int input_depth, int32 input_offset, int input_row_width,
-                  const int32* bias_ptr, int32 output_offset,
-                  int32 output_multiplier, int output_shift,
-                  int32 output_activation_min, int32 output_activation_max,
-                  uint8* output_ptr, int output_depth, int output_width) {
-    // 16 depth accumulators for the 2 outputs.
-    Int32x16 acc0, acc1;
-
-    // Accumulators for top filter.
-    acc0.v0 = vld1q_s32(bias_ptr);
-    acc0.v1 = vld1q_s32(bias_ptr + 4);
-    acc0.v2 = vld1q_s32(bias_ptr + 8);
-    acc0.v3 = vld1q_s32(bias_ptr + 12);
-    // Accumulators for bottom filter.
-    acc1.v0 = vld1q_s32(bias_ptr);
-    acc1.v1 = vld1q_s32(bias_ptr + 4);
-    acc1.v2 = vld1q_s32(bias_ptr + 8);
-    acc1.v3 = vld1q_s32(bias_ptr + 12);
-
-    // Main multiply accumulate work.
-    {
-      // Load inputs for one filter row at a time.
-      Int16x16x3 input;
-
-      // Do first row of top filter.
-      input = LoadInputRowDepth16(input_ptr, input_depth, input_offset, input);
-      acc0 = MultiplyAccumulateRowDepth16(acc0, filter.r0, input);
-
-      // Do second row of top filter.
-      input = LoadInputRowDepth16(input_ptr + input_row_width, input_depth,
-                                  input_offset, input);
-      acc0 = MultiplyAccumulateRowDepth16(acc0, filter.r1, input);
-
-      // The inputs to second row of the top filter are also the inputs to the
-      // first row of the bottom filter.
-      acc1 = MultiplyAccumulateRowDepth16(acc1, filter.r0, input);
-
-      // Do third row of top filter.
-      input = LoadInputRowDepth16(input_ptr + 2 * input_row_width, input_depth,
-                                  input_offset, input);
-      acc0 = MultiplyAccumulateRowDepth16(acc0, filter.r2, input);
-
-      // The inputs to third row of the top filter are also the inputs to the
-      // second row of the bottom filter.
-      acc1 = MultiplyAccumulateRowDepth16(acc1, filter.r1, input);
-
-      // Do third row of bottom filter.
-      input = LoadInputRowDepth16(input_ptr + 3 * input_row_width, input_depth,
-                                  input_offset, input);
-      acc1 = MultiplyAccumulateRowDepth16(acc1, filter.r2, input);
-    }
-
-    // Apply activation, downquantize and store.
-    int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
-    int32x4_t output_activation_min_vec = vdupq_n_s32(output_activation_min);
-    int32x4_t output_activation_max_vec = vdupq_n_s32(output_activation_max);
-
-    DownquantizeAndStoreDepth16(acc0, output_multiplier, output_shift,
-                                output_offset_vec, output_activation_min_vec,
-                                output_activation_max_vec, output_ptr);
-
-    DownquantizeAndStoreDepth16(acc1, output_multiplier, output_shift,
-                                output_offset_vec, output_activation_min_vec,
-                                output_activation_max_vec,
-                                output_ptr + output_depth * output_width);
+struct ConvRow3x3FilterDepth8<4, 2> {
+  // The buffer size of the shuffled input.
+  static inline constexpr int ShuffleWorkspaceSize() { return 64 * 9 * 9; }
+
+  static inline void Run(const uint8* input_data, int start_x, int start_y,
+                         int input_depth, int input_width, int input_height,
+                         int input_row_size, int32 input_offset,
+                         const uint8* filter_data, int32 filter_offset,
+                         const int32* bias_data, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         int output_depth, int output_width,
+                         uint8* shuffle_workspace) {
+    // Branch and cache misses increase substantially with stride 2 kernels.
+    // Adding prefetching reduces latency by as much as 2x.
+    const int i0 = 0;
+    const int i1 = input_depth;
+    const int i2 = 2 * input_depth;
+    const int i3 = 3 * input_depth;
+    const int i4 = 4 * input_depth;
+    const int i5 = 5 * input_depth;
+    const int i6 = 6 * input_depth;
+    const int i7 = 7 * input_depth;
+    const int i8 = 8 * input_depth;
+
+#define DEPTHWISECONV_PRELOAD_ROW(input_ptr, i)         \
+  preload_l1_keep(input_ptr + i * input_row_size + i0); \
+  preload_l1_keep(input_ptr + i * input_row_size + i1); \
+  preload_l1_keep(input_ptr + i * input_row_size + i2); \
+  preload_l1_keep(input_ptr + i * input_row_size + i3); \
+  preload_l1_keep(input_ptr + i * input_row_size + i4); \
+  preload_l1_keep(input_ptr + i * input_row_size + i5); \
+  preload_l1_keep(input_ptr + i * input_row_size + i6); \
+  preload_l1_keep(input_ptr + i * input_row_size + i7); \
+  preload_l1_keep(input_ptr + i * input_row_size + i8);
+
+    int out_x = start_x;
+    // 4x4 at a time.
+    for (; out_x <= output_width - 4; out_x += 4) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      int depth = 0;
+      for (; depth <= output_depth - 64; depth += 64) {
+        // Preload 9x9 input.
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 0);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 1);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 2);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 3);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 4);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 5);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 6);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 7);
+        DEPTHWISECONV_PRELOAD_ROW(input_ptr, 8);
+
+        // For a large input window (64x9x9) that is small enough to fit in L1
+        // cache, copy the input into a separate buffer and run the kernel on
+        // this new buffer. This reduces the likelihood of cache misses when
+        // the kernel is loading input data. If this size is ever changed,
+        // update the ShuffleWorkspaceSize() function to return the new size.
+        ShuffleInput(input_ptr, input_depth, input_width, input_height, 64, 9,
+                     9, shuffle_workspace);
+        const uint8* shuffled_ptr = &shuffle_workspace[0];
+
+        for (int micro_depth = 0; micro_depth <= 64 - 8; micro_depth += 8) {
+          ConvKernel3x3FilterDepth8<4, 4, 2>::Run(
+              shuffled_ptr, 64, input_offset, 64 * 9, filter_ptr, filter_offset,
+              bias_ptr, output_offset, output_multiplier, output_shift,
+              output_activation_min, output_activation_max, output_ptr,
+              output_depth, output_width);
+
+          shuffled_ptr += 8;
+          output_ptr += 8;
+          filter_ptr += 8;
+          bias_ptr += 8;
+        }
+        input_ptr += 64;
+      }
+
+      // Preload 9x9 input one more time for the rest of the depth.
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 0);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 1);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 2);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 3);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 4);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 5);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 6);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 7);
+      DEPTHWISECONV_PRELOAD_ROW(input_ptr, 8);
+
+      for (; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<4, 4, 2>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 4 * 2 * input_depth;
+      output_data += 4 * output_depth;
+    }
+
+#undef DEPTHWISECONV_PRELOAD_ROW
+
+    // Handle the rest of the right side.
+    // 4x2 at a time.
+    for (; out_x <= output_width - 2; out_x += 2) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<4, 2, 2>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 2 * 2 * input_depth;
+      output_data += 2 * output_depth;
+    }
+
+    // 4x1 at a time.
+    for (; out_x < output_width; out_x++) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      for (int depth = 0; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<4, 1, 2>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 2 * input_depth;
+      output_data += output_depth;
+    }
    }
  };
  
  template <>
-struct ConvKernel3x3FilterDepth16<1, 2, 2> {
-  static void Run(const Filter3x3x16& filter, const uint8* input_ptr,
-                  int input_depth, int32 input_offset, int input_row_width,
-                  const int32* bias_ptr, int32 output_offset,
-                  int32 output_multiplier, int output_shift,
-                  int32 output_activation_min, int32 output_activation_max,
-                  uint8* output_ptr, int output_depth, int output_width) {
-    // 16 depth accumulators for the 2 outputs.
-    Int32x16 acc0, acc1;
-
-    // Accumulators for top filter.
-    acc0.v0 = vld1q_s32(bias_ptr);
-    acc0.v1 = vld1q_s32(bias_ptr + 4);
-    acc0.v2 = vld1q_s32(bias_ptr + 8);
-    acc0.v3 = vld1q_s32(bias_ptr + 12);
-    // Accumulators for bottom filter.
-    acc1.v0 = vld1q_s32(bias_ptr);
-    acc1.v1 = vld1q_s32(bias_ptr + 4);
-    acc1.v2 = vld1q_s32(bias_ptr + 8);
-    acc1.v3 = vld1q_s32(bias_ptr + 12);
-
-    // Main multiply accumulate work.
-    {
-      // Load inputs for one filter row at a time.
-      Int16x16x3 input;
-
-      // Do first row of top filter.
-      input = LoadInputRowDepth16(input_ptr, input_depth, input_offset, input);
-      acc0 = MultiplyAccumulateRowDepth16(acc0, filter.r0, input);
-
-      // Do second row of top filter.
-      input = LoadInputRowDepth16(input_ptr + input_row_width, input_depth,
-                                  input_offset, input);
-      acc0 = MultiplyAccumulateRowDepth16(acc0, filter.r1, input);
-
-      // Do third row of top filter.
-      input = LoadInputRowDepth16(input_ptr + 2 * input_row_width, input_depth,
-                                  input_offset, input);
-      acc0 = MultiplyAccumulateRowDepth16(acc0, filter.r2, input);
-
-      // The inputs to third row of the top filter are also the inputs
-      // to first row of the bottom filter.
-      acc1 = MultiplyAccumulateRowDepth16(acc1, filter.r0, input);
-
-      // Do second row of bottom filter.
-      input = LoadInputRowDepth16(input_ptr + 3 * input_row_width, input_depth,
-                                  input_offset, input);
-      acc1 = MultiplyAccumulateRowDepth16(acc1, filter.r1, input);
-
-      // Do third row of bottom filter.
-      input = LoadInputRowDepth16(input_ptr + 4 * input_row_width, input_depth,
-                                  input_offset, input);
-      acc1 = MultiplyAccumulateRowDepth16(acc1, filter.r2, input);
-    }
-
-    // Apply activation, downquantize and store.
-    int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
-    int32x4_t output_activation_min_vec = vdupq_n_s32(output_activation_min);
-    int32x4_t output_activation_max_vec = vdupq_n_s32(output_activation_max);
-
-    DownquantizeAndStoreDepth16(acc0, output_multiplier, output_shift,
-                                output_offset_vec, output_activation_min_vec,
-                                output_activation_max_vec, output_ptr);
-
-    DownquantizeAndStoreDepth16(acc1, output_multiplier, output_shift,
-                                output_offset_vec, output_activation_min_vec,
-                                output_activation_max_vec,
-                                output_ptr + output_depth * output_width);
+struct ConvRow3x3FilterDepth8<8, 2> {
+  static inline void Run(const uint8* input_data, int start_x, int start_y,
+                         int input_depth, int input_width, int input_height,
+                         int input_row_size, int32 input_offset,
+                         const uint8* filter_data, int32 filter_offset,
+                         const int32* bias_data, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         int output_depth, int output_width,
+                         uint8* shuffle_workspace) {
+    // Reuse 4 row kernels twice.
+    ConvRow3x3FilterDepth8<4, 2>::Run(
+        input_data, start_x, start_y, input_depth, input_width, input_height,
+        input_row_size, input_offset, filter_data, filter_offset, bias_data,
+        output_offset, output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_data, output_depth, output_width,
+        shuffle_workspace);
+
+    ConvRow3x3FilterDepth8<4, 2>::Run(
+        input_data + 2 * 4 * input_row_size, start_x, start_y + 4, input_depth,
+        input_width, input_height, input_row_size, input_offset, filter_data,
+        filter_offset, bias_data, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max,
+        output_data + 4 * output_depth * output_width, output_depth,
+        output_width, shuffle_workspace);
    }
  };
  
  template <>
-struct ConvKernel3x3FilterDepth16<1, 1> {
-  static void Run(const Filter3x3x16& filter, const uint8* input_ptr,
-                  int input_depth, int32 input_offset, int input_row_width,
-                  const int32* bias_ptr, int32 output_offset,
-                  int32 output_multiplier, int output_shift,
-                  int32 output_activation_min, int32 output_activation_max,
-                  uint8* output_ptr, int output_depth, int output_width) {
-    Int32x16 acc;
-    acc.v0 = vld1q_s32(bias_ptr);
-    acc.v1 = vld1q_s32(bias_ptr + 4);
-    acc.v2 = vld1q_s32(bias_ptr + 8);
-    acc.v3 = vld1q_s32(bias_ptr + 12);
-
-    // Main multiply accumulate work.
-    {
-      // Load inputs for one filter row at a time.
-      Int16x16x3 input;
-
-      // Do first row.
-      input = LoadInputRowDepth16(input_ptr, input_depth, input_offset, input);
-      acc = MultiplyAccumulateRowDepth16(acc, filter.r0, input);
-
-      // Do second row.
-      input = LoadInputRowDepth16(input_ptr + input_row_width, input_depth,
-                                  input_offset, input);
-      acc = MultiplyAccumulateRowDepth16(acc, filter.r1, input);
-
-      // Do third row.
-      input = LoadInputRowDepth16(input_ptr + 2 * input_row_width, input_depth,
-                                  input_offset, input);
-      acc = MultiplyAccumulateRowDepth16(acc, filter.r2, input);
-    }
-
-    // Apply activation, downquantize and store.
-    int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
-    int32x4_t output_activation_min_vec = vdupq_n_s32(output_activation_min);
-    int32x4_t output_activation_max_vec = vdupq_n_s32(output_activation_max);
-
-    DownquantizeAndStoreDepth16(acc, output_multiplier, output_shift,
-                                output_offset_vec, output_activation_min_vec,
-                                output_activation_max_vec, output_ptr);
+struct ConvRow3x3FilterDepth8<8, 1> {
+  // The buffer size of the shuffled input.
+  static inline constexpr int ShuffleWorkspaceSize() { return 64 * 10 * 10; }
+
+  static inline void Run(const uint8* input_data, int start_x, int start_y,
+                         int input_depth, int input_width, int input_height,
+                         int input_row_size, int32 input_offset,
+                         const uint8* filter_data, int32 filter_offset,
+                         const int32* bias_data, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         int output_depth, int output_width,
+                         uint8* shuffle_workspace) {
+    int out_x = start_x;
+    // 8x8 at a time.
+    for (; out_x <= output_width - 8; out_x += 8) {
+      const int32* bias_ptr = bias_data;
+      const uint8* filter_ptr = filter_data;
+
+      const uint8* input_ptr = input_data;
+      uint8* output_ptr = output_data;
+
+      int depth = 0;
+      for (; depth <= output_depth - 64; depth += 64) {
+        // For a large input window (64x10x10) that is small enough to fit in L1
+        // cache, copy the input into a separate buffer and run the kernel on
+        // this new buffer. This reduces the likelihood of cache misses when
+        // the kernel is loading input data. If the size of the input window
+        // changes, update the function ShuffleWorkspaceSize() with the new
+        // size.
+        ShuffleInput(input_ptr, input_depth, input_width, input_height, 64, 10,
+                     10, shuffle_workspace);
+        const uint8* shuffled_ptr = shuffle_workspace;
+
+        for (int micro_depth = 0; micro_depth <= 64 - 8; micro_depth += 8) {
+          ConvKernel3x3FilterDepth8<8, 8, 1>::Run(
+              shuffled_ptr, 64, input_offset, 64 * 10, filter_ptr,
+              filter_offset, bias_ptr, output_offset, output_multiplier,
+              output_shift, output_activation_min, output_activation_max,
+              output_ptr, output_depth, output_width);
+
+          shuffled_ptr += 8;
+          output_ptr += 8;
+          filter_ptr += 8;
+          bias_ptr += 8;
+        }
+        input_ptr += 64;
+      }
+
+      for (; depth <= output_depth - 8; depth += 8) {
+        ConvKernel3x3FilterDepth8<8, 8, 1>::Run(
+            input_ptr, input_depth, input_offset, input_row_size, filter_ptr,
+            filter_offset, bias_ptr, output_offset, output_multiplier,
+            output_shift, output_activation_min, output_activation_max,
+            output_ptr, output_depth, output_width);
+
+        input_ptr += 8;
+        output_ptr += 8;
+        filter_ptr += 8;
+        bias_ptr += 8;
+      }
+
+      input_data += 8 * input_depth;
+      output_data += 8 * output_depth;
+    }
+
+    // Handle the rest of the right side by re-using 4 row kernels twice.
+    ConvRow3x3FilterDepth8<4, 1>::Run(
+        input_data, out_x, start_y, input_depth, input_width, input_height,
+        input_row_size, input_offset, filter_data, filter_offset, bias_data,
+        output_offset, output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_data, output_depth, output_width,
+        shuffle_workspace);
+
+    ConvRow3x3FilterDepth8<4, 1>::Run(
+        input_data + 4 * input_row_size, out_x, start_y + 4, input_depth,
+        input_width, input_height, input_row_size, input_offset, filter_data,
+        filter_offset, bias_data, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max,
+        output_data + 4 * output_depth * output_width, output_depth,
+        output_width, shuffle_workspace);
    }
  };
  
-inline bool Fast3by3FilterKernelSupported(const Dims<4>& input_dims,
-                                          const Dims<4>& filter_dims,
-                                          int stride_width, int stride_height,
-                                          int pad_width, int pad_height,
-                                          int depth_multiplier,
-                                          const Dims<4>& output_dims) {
+inline bool Fast3x3FilterKernelSupported(const Dims<4>& input_dims,
+                                         const Dims<4>& filter_dims,
+                                         int stride_width, int stride_height,
+                                         int pad_width, int pad_height,
+                                         int depth_multiplier,
+                                         const Dims<4>& output_dims) {
    const int input_height = ArraySize(input_dims, 2);
    const int input_width = ArraySize(input_dims, 1);
    const int input_depth = ArraySize(input_dims, 0);
@@ -458,14 +4426,14 @@ inline bool Fast3by3FilterKernelSupported(const Dims<4>& input_dims,
                     depth_multiplier == 1 &&
                     (stride_width == 1 || stride_width == 2) &&
                     (stride_height == 1 || stride_height == 2) &&
-                   pad_width == 0 && pad_height == 0 && (input_depth % 16) == 0;
+                   pad_width == 0 && pad_height == 0 && (input_depth % 8) == 0;
  
    if (!supported) {
      return false;
    }
  
-  // Handle case where padding is zero but type is not kValid. This would
-  // require special boundary case handling that is not supported yet.
+  // Handle case where padding is zero but padding type is not kValid.
+  // This would require special boundary case handling that is not supported.
  
    const int out_x = output_width - 1;
    const int out_y = output_height - 1;
@@ -481,7 +4449,7 @@ inline bool Fast3by3FilterKernelSupported(const Dims<4>& input_dims,
    return in_x_end <= input_width && in_y_end <= input_height;
  }
  
-inline void DepthwiseConv3by3FilterDepth16(
+inline void DepthwiseConv3x3Filter(
      const uint8* input_data, const Dims<4>& input_dims, int32 input_offset,
      const uint8* filter_data, const Dims<4>& filter_dims, int32 filter_offset,
      const int32* bias_data, const Dims<4>& bias_dims, int stride_width,
@@ -500,241 +4468,108 @@ inline void DepthwiseConv3by3FilterDepth16(
    const int output_width = ArraySize(output_dims, 1);
  
    // Algorithm assumes below constraints. It is optimized for depth multiplier
-  // of 1, 3x3 filter, no padding, strides 1 and 2.
+  // of 1, 3x3 filter, no padding and strides 1 and 2.
    TFLITE_DCHECK(output_depth == input_depth * depth_multiplier);
    TFLITE_DCHECK(depth_multiplier == 1);
    TFLITE_DCHECK(filter_height == 3);
    TFLITE_DCHECK(filter_width == 3);
    TFLITE_DCHECK(pad_height == 0);
    TFLITE_DCHECK(pad_width == 0);
-  TFLITE_DCHECK(stride_width == 1 || stride_width == 2);
    TFLITE_DCHECK(stride_height == 1 || stride_height == 2);
+  TFLITE_DCHECK(stride_width == 1 || stride_width == 2);
  
-  // The number of outputs to process in the main loop.
-  const int num_x_outputs = 1;
-  const int num_y_outputs = 2;
-
-  const int input_row_width = output_depth * (input_width + 2 * pad_width);
-  const int input_batch_size =
-      input_row_width * (input_height + 2 * pad_height);
+  const int input_row_size = input_depth * (input_width + 2 * pad_width);
+  const int output_row_size = output_depth * output_width;
+  const int input_batch_size = input_row_size * (input_height + 2 * pad_height);
    const int output_batch_size = output_depth * output_width * output_height;
-  const int input_ptr_x_increment = input_depth * stride_width;
  
-  // Calculate extents of non-boundary loop.
-  int out_x_start = 0;
-  for (; out_x_start < input_width; out_x_start++) {
-    int in_x = (out_x_start * stride_width) - pad_width;
-    if (in_x >= 0) {
-      break;
-    }
-  }
-  int out_x_end = output_width - 1;
-  for (; out_x_end >= 0; out_x_end--) {
-    int in_x = (out_x_end * stride_width) - pad_width;
-    int in_x_end = in_x + filter_width + (num_x_outputs - 1) * stride_width;
-    if (in_x_end <= input_width) {
-      out_x_end++;
-      break;
-    }
-  }
-  int out_y_start = 0;
-  for (; out_y_start < input_height; out_y_start++) {
-    int in_y = (out_y_start * stride_height) - pad_height;
-    if (in_y >= 0) {
-      break;
-    }
-  }
-  int out_y_end = output_height - 1;
-  for (; out_y_end >= 0; out_y_end--) {
-    int in_y = (out_y_end * stride_height) - pad_height;
-    int in_y_end = in_y + filter_height + (num_y_outputs - 1) * stride_height;
-    if (in_y_end <= input_height) {
-      out_y_end++;
-      break;
-    }
+  using conv_row_func_t = decltype(&ConvRow3x3FilterDepth8<1, 1>::Run);
+  conv_row_func_t conv_1_output_row = ConvRow3x3FilterDepth8<1, 1>::Run;
+  conv_row_func_t conv_2_output_rows = ConvRow3x3FilterDepth8<2, 1>::Run;
+  conv_row_func_t conv_4_output_rows = ConvRow3x3FilterDepth8<4, 1>::Run;
+  conv_row_func_t conv_8_output_rows = ConvRow3x3FilterDepth8<8, 1>::Run;
+
+  if (stride_width == 2) {
+    conv_1_output_row = ConvRow3x3FilterDepth8<1, 2>::Run;
+    conv_2_output_rows = ConvRow3x3FilterDepth8<2, 2>::Run;
+    conv_4_output_rows = ConvRow3x3FilterDepth8<4, 2>::Run;
+    conv_8_output_rows = ConvRow3x3FilterDepth8<8, 2>::Run;
    }
  
-  using dot_product_func_t =
-      decltype(&ConvKernel3x3FilterDepth16<1, 2, 1>::Run);
-  dot_product_func_t dot_product_func = nullptr;
+  // Allocate maximum memory needed for shuffled input.
+  // TODO(mariewhite): The size of this workspace is small enough to be
+  // allocated on the stack. Eventually we will want to move it to the heap
+  // and have it allocated outside of this function, like the im2col_array used
+  // in gemmlowp.
+#define DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE 10 * 10 * 64
+  uint8 shuffle_workspace[DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE];
  
-  if (stride_width == 1 && stride_height == 1) {
-    dot_product_func = ConvKernel3x3FilterDepth16<1, 2, 1>::Run;
-  } else {
-    dot_product_func = ConvKernel3x3FilterDepth16<1, 2, 2>::Run;
-  }
+  // Make sure the kernels using this buffer will not run out of bounds.
+  static_assert(ConvRow3x3FilterDepth8<8, 1>::ShuffleWorkspaceSize() <=
+                    DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE,
+                "Shuffle workspace size is too small.");
+  static_assert(ConvRow3x3FilterDepth8<4, 2>::ShuffleWorkspaceSize() <=
+                    DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE,
+                "Shuffle workspace size is too small.");
  
-  // Offsets for preloading inputs.
-  const int i0 = 0;
-  const int i1 = input_depth;
-  const int i2 = 2 * input_depth;
-  const int i3 = input_row_width;
-  const int i4 = input_row_width + input_depth;
-  const int i5 = input_row_width + 2 * input_depth;
-  const int i6 = 2 * input_row_width;
-  const int i7 = 2 * input_row_width + input_depth;
-  const int i8 = 2 * input_row_width + 2 * input_depth;
-  const int i9 = 3 * input_row_width;
-  const int i10 = 3 * input_row_width + input_depth;
-  const int i11 = 3 * input_row_width + 2 * input_depth;
-  const int i12 = 4 * input_row_width;
-  const int i13 = 4 * input_row_width + input_depth;
-  const int i14 = 4 * input_row_width + 2 * input_depth;
+#undef DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE
  
    for (int b = 0; b < batches; ++b) {
-    const int32* bias_ptr = bias_data;
-    const uint8* filter_ptr = filter_data;
-
-    const int in_batch_offset = b * input_batch_size;
-    const int out_batch_offset = b * output_batch_size;
-
-    int depth = 0;
-    for (; depth <= output_depth - 16; depth += 16) {
-      Filter3x3x16 filter =
-          LoadFilterDepth16(filter_ptr, filter_offset, output_depth);
-
-      // Handle 1x2 outputs.
-      int out_y = out_y_start;
-      for (; out_y < out_y_end; out_y += num_y_outputs) {
-        int out_x = out_x_start;
-
-        int in_y_offset =
-            stride_height * input_row_width * (out_y + pad_height);
-        int in_x_offset = stride_width * input_depth * (out_x + pad_width);
-
-        const uint8* input_ptr =
-            input_data + depth + in_x_offset + in_y_offset + in_batch_offset;
-
-        // Preload inputs. If input depth is large, preload every value of the
-        // input for this depth range. Otherwise, preload only the first values
-        // of each row.
-        if (input_depth >= 32) {
-          preload_l1_keep(input_ptr + i0);
-          preload_l1_keep(input_ptr + i1);
-          preload_l1_keep(input_ptr + i2);
-          preload_l1_keep(input_ptr + i3);
-          preload_l1_keep(input_ptr + i4);
-          preload_l1_keep(input_ptr + i5);
-          preload_l1_keep(input_ptr + i6);
-          preload_l1_keep(input_ptr + i7);
-          preload_l1_keep(input_ptr + i8);
-          preload_l1_keep(input_ptr + i9);
-          preload_l1_keep(input_ptr + i10);
-          preload_l1_keep(input_ptr + i11);
-
-          if (stride_height == 2) {
-            preload_l1_keep(input_ptr + i12);
-            preload_l1_keep(input_ptr + i13);
-            preload_l1_keep(input_ptr + i14);
-          }
-        } else {
-          preload_l1_keep(input_ptr + i0);
-          preload_l1_keep(input_ptr + i3);
-          preload_l1_keep(input_ptr + i6);
-          preload_l1_keep(input_ptr + i9);
-
-          if (stride_height == 2) {
-            preload_l1_keep(input_ptr + i12);
-          }
-        }
+    const uint8* input_ptr = input_data + b * input_batch_size;
+    uint8* output_ptr = output_data + b * output_batch_size;
  
-        uint8* output_ptr = output_data + depth + (out_x * output_depth) +
-                            (output_depth * output_width * out_y) +
-                            out_batch_offset;
-
-        for (; out_x < out_x_end; out_x += num_x_outputs) {
-          dot_product_func(filter, input_ptr, input_depth, input_offset,
-                           input_row_width, bias_ptr, output_offset,
-                           output_multiplier, output_shift,
-                           output_activation_min, output_activation_max,
-                           output_ptr, output_depth, output_width);
-
-          input_ptr += input_ptr_x_increment * num_x_outputs;
-          output_ptr += output_depth * num_x_outputs;
-
-          // Preload the next inputs depending on stride.
-          if (stride_width == 1) {
-            preload_l1_keep(input_ptr + i2);
-            preload_l1_keep(input_ptr + i5);
-            preload_l1_keep(input_ptr + i8);
-            preload_l1_keep(input_ptr + i11);
-          } else if (stride_width == 2) {
-            preload_l1_keep(input_ptr + i1);
-            preload_l1_keep(input_ptr + i2);
-            preload_l1_keep(input_ptr + i4);
-            preload_l1_keep(input_ptr + i5);
-            preload_l1_keep(input_ptr + i7);
-            preload_l1_keep(input_ptr + i8);
-            preload_l1_keep(input_ptr + i10);
-            preload_l1_keep(input_ptr + i11);
-            preload_l1_keep(input_ptr + i13);
-            preload_l1_keep(input_ptr + i14);
-          }
-        }
+    int out_y = 0;
  
-        // Handle the rest of the right side.
-        for (; out_x < output_width; out_x++) {
-          // This code path can only be reached if we're handling >1 x outputs
-          // at a time or support kSame padding.
-        }
-      }
+    // Handle 8 rows at a time.
+    for (; out_y <= output_height - 8; out_y += 8) {
+      conv_8_output_rows(input_ptr, 0, out_y, input_depth, input_width,
+                         input_height, input_row_size, input_offset,
+                         filter_data, filter_offset, bias_data, output_offset,
+                         output_multiplier, output_shift, output_activation_min,
+                         output_activation_max, output_ptr, output_depth,
+                         output_width, shuffle_workspace);
  
-      // Handle the rest of the bottom side.
-      for (; out_y < output_height; out_y++) {
-        int out_x = out_x_start;
-
-        int in_y_offset =
-            stride_height * input_row_width * (out_y + pad_height);
-        int in_x_offset = stride_width * input_depth * (out_x + pad_width);
-
-        const uint8* input_ptr =
-            input_data + depth + in_x_offset + in_y_offset + in_batch_offset;
-
-        if (input_depth >= 32) {
-          preload_l1_keep(input_ptr + i0);
-          preload_l1_keep(input_ptr + i1);
-          preload_l1_keep(input_ptr + i2);
-          preload_l1_keep(input_ptr + i3);
-          preload_l1_keep(input_ptr + i4);
-          preload_l1_keep(input_ptr + i5);
-          preload_l1_keep(input_ptr + i6);
-          preload_l1_keep(input_ptr + i7);
-        } else {
-          preload_l1_keep(input_ptr + i0);
-          preload_l1_keep(input_ptr + i3);
-          preload_l1_keep(input_ptr + i6);
-        }
+      input_ptr += 8 * stride_height * input_row_size;
+      output_ptr += 8 * output_row_size;
+    }
  
-        uint8* output_ptr = output_data + depth + (out_x * output_depth) +
-                            (output_depth * output_width * out_y) +
-                            out_batch_offset;
+    // Handle 4 rows at a time.
+    for (; out_y <= output_height - 4; out_y += 4) {
+      conv_4_output_rows(input_ptr, 0, out_y, input_depth, input_width,
+                         input_height, input_row_size, input_offset,
+                         filter_data, filter_offset, bias_data, output_offset,
+                         output_multiplier, output_shift, output_activation_min,
+                         output_activation_max, output_ptr, output_depth,
+                         output_width, shuffle_workspace);
  
-        for (; out_x < output_width; out_x++) {
-          ConvKernel3x3FilterDepth16<1, 1>::Run(
-              filter, input_ptr, input_depth, input_offset, input_row_width,
-              bias_ptr, output_offset, output_multiplier, output_shift,
-              output_activation_min, output_activation_max, output_ptr,
-              output_depth, output_width);
+      input_ptr += 4 * stride_height * input_row_size;
+      output_ptr += 4 * output_row_size;
+    }
  
-          input_ptr += input_ptr_x_increment;
-          output_ptr += output_depth;
-
-          if (stride_width == 1) {
-            preload_l1_keep(input_ptr + i2);
-            preload_l1_keep(input_ptr + i5);
-            preload_l1_keep(input_ptr + i8);
-          } else if (stride_width == 2) {
-            preload_l1_keep(input_ptr + i1);
-            preload_l1_keep(input_ptr + i2);
-            preload_l1_keep(input_ptr + i4);
-            preload_l1_keep(input_ptr + i5);
-            preload_l1_keep(input_ptr + i7);
-            preload_l1_keep(input_ptr + i8);
-          }
-        }
-      }
-      filter_ptr += 16;
-      bias_ptr += 16;
+    // Handle 2 rows at a time.
+    for (; out_y <= output_height - 2; out_y += 2) {
+      conv_2_output_rows(input_ptr, 0, out_y, input_depth, input_width,
+                         input_height, input_row_size, input_offset,
+                         filter_data, filter_offset, bias_data, output_offset,
+                         output_multiplier, output_shift, output_activation_min,
+                         output_activation_max, output_ptr, output_depth,
+                         output_width, shuffle_workspace);
+
+      input_ptr += 2 * stride_height * input_row_size;
+      output_ptr += 2 * output_row_size;
+    }
+
+    // Handle one row at a time.
+    for (; out_y < output_height; out_y++) {
+      conv_1_output_row(input_ptr, 0, out_y, input_depth, input_width,
+                        input_height, input_row_size, input_offset, filter_data,
+                        filter_offset, bias_data, output_offset,
+                        output_multiplier, output_shift, output_activation_min,
+                        output_activation_max, output_ptr, output_depth,
+                        output_width, shuffle_workspace);
+
+      input_ptr += stride_height * input_row_size;
+      output_ptr += output_row_size;
      }
    }
  }
author	A. Unique TensorFlower <gardener@tensorflow.org>
	Wed, 11 Apr 2018 20:29:12 +0000 (13:29 -0700)
committer	TensorFlower Gardener <gardener@tensorflow.org>
	Wed, 11 Apr 2018 20:31:29 +0000 (13:31 -0700)
tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h		patch \| blob \| history
tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h		patch \| blob \| history