compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h

   1 /*
   2  * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
   3  * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
   4  *
   5  * Licensed under the Apache License, Version 2.0 (the "License");
   6  * you may not use this file except in compliance with the License.
   7  * You may obtain a copy of the License at
   8  *
   9  *      http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 #ifndef __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_UINT8_H__
  19 #define __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_UINT8_H__
  20
  21 #include "cker/Shape.h"
  22 #include "cker/Types.h"
  23 #include "cker/Utils.h"
  24 #include "cker/neon/neon_check.h"
  25
  26 #include <fixedpoint/fixedpoint.h>
  27 #include <public/gemmlowp.h>
  28
  29 namespace nnfw
  30 {
  31 namespace cker
  32 {
  33 namespace optimized
  34 {
  35
  36 // Implementation of quantized DepthwiseConv
  37
  38 template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
  39 struct QuantizedDepthwiseConvKernel
  40 {
  41 };
  42
  43 #ifdef USE_NEON
  44 template <> struct QuantizedDepthwiseConvKernel<true, 8, 2>
  45 {
  46   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
  47                   const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
  48                   const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
  49   {
  50     (void)input_depth;
  51     (void)depth_multiplier;
  52     // Load the filters, add filter_offset.
  53     uint8x8x2_t filter_u8;
  54     filter_u8.val[0] = vld1_u8(filter_ptr);
  55     filter_u8.val[1] = vld1_u8(filter_ptr + 8);
  56     int16x8_t filter[2];
  57     for (int i = 0; i < 2; i++)
  58     {
  59       filter[i] =
  60           vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])), vdupq_n_s16(filter_offset));
  61     }
  62     // Handle one output pixel at a time.
  63     for (int outp = 0; outp < num_output_pixels; outp++)
  64     {
  65       // Load the accumulators from acc_buffer
  66       int32x4x2_t acc[2];
  67       for (int i = 0; i < 2; i++)
  68       {
  69         acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
  70         acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
  71       }
  72       // Load the inputs, add input_offset.
  73       const uint8x8_t input_u8 = vld1_u8(input_ptr);
  74       input_ptr += input_ptr_increment;
  75       const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
  76       const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
  77       // Duplicate the input values, 2-fold
  78       const int16x8x2_t input_dup2 = vzipq_s16(input, input);
  79       // Multiply-accumulate
  80       for (int i = 0; i < 2; i++)
  81       {
  82         acc[0].val[i] =
  83             vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), vget_low_s16(input_dup2.val[i]));
  84         acc[1].val[i] =
  85             vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), vget_high_s16(input_dup2.val[i]));
  86       }
  87       // Store the accumulators back to acc_buffer
  88       for (int i = 0; i < 2; i++)
  89       {
  90         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
  91         vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
  92       }
  93       acc_buffer_ptr += 16;
  94     }
  95   }
  96 };
  97
  98 template <> struct QuantizedDepthwiseConvKernel<false, 8, 1>
  99 {
 100   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 101                   const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
 102                   const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
 103   {
 104     (void)input_depth;
 105     (void)depth_multiplier;
 106     (void)input_ptr_increment;
 107     // Load the filters, add filter_offset.
 108     const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
 109     const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
 110     const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
 111
 112     int outp = 0;
 113     // Handle 2 output pixels at a time.
 114     for (; outp <= num_output_pixels - 2; outp += 2)
 115     {
 116       // Load the accumulators from acc_buffer.
 117       int32x4_t acc[4];
 118       for (int i = 0; i < 4; i++)
 119       {
 120         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 121       }
 122       // Load the inputs, add input_offset.
 123       uint8x8_t input_u8[2];
 124       for (int i = 0; i < 2; i++)
 125       {
 126         input_u8[i] = vld1_u8(input_ptr + 8 * i);
 127       }
 128       input_ptr += 16;
 129       int16x8_t input[2];
 130       for (int i = 0; i < 2; i++)
 131       {
 132         input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
 133       }
 134       for (int i = 0; i < 2; i++)
 135       {
 136         input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
 137       }
 138       // Multiply-accumulate.
 139       acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0]));
 140       acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0]));
 141       acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1]));
 142       acc[3] = vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1]));
 143       // Store the accumulators back to acc_buffer
 144       for (int i = 0; i < 4; i++)
 145       {
 146         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 147       }
 148       acc_buffer_ptr += 16;
 149     }
 150     // Handle 1 output pixel at a time.
 151     for (; outp < num_output_pixels; outp++)
 152     {
 153       // Load the accumulators from acc_buffer.
 154       int32x4_t acc[2];
 155       acc[0] = vld1q_s32(acc_buffer_ptr);
 156       acc[1] = vld1q_s32(acc_buffer_ptr + 4);
 157
 158       // Load the inputs, add input_offset.
 159       const uint8x8_t input_u8 = vld1_u8(input_ptr);
 160       input_ptr += 8;
 161       const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
 162       const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
 163       // Multiply-accumulate.
 164       acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input));
 165       acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input));
 166       // Store the accumulators back to acc_buffer
 167       vst1q_s32(acc_buffer_ptr, acc[0]);
 168       vst1q_s32(acc_buffer_ptr + 4, acc[1]);
 169       acc_buffer_ptr += 8;
 170     }
 171   }
 172 };
 173
 174 template <> struct QuantizedDepthwiseConvKernel<false, 4, 2>
 175 {
 176   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 177                   const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
 178                   const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
 179   {
 180     (void)input_depth;
 181     (void)depth_multiplier;
 182     (void)input_ptr_increment;
 183     // Load the filters, add filter_offset.
 184     const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
 185     const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
 186     const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
 187
 188     int outp = 0;
 189     // Handle 2 output pixels at a time.
 190     for (; outp <= num_output_pixels - 2; outp += 2)
 191     {
 192       // Load the accumulators from acc_buffer
 193       int32x4_t acc[4];
 194       for (int i = 0; i < 4; i++)
 195       {
 196         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 197       }
 198       // Load the inputs, add input_offset.
 199       const uint8x8_t input_u8 = vld1_u8(input_ptr);
 200       input_ptr += 8;
 201       const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
 202       const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
 203       // Duplicate the input values, 2-fold
 204       const int16x8x2_t input_dup2 = vzipq_s16(input, input);
 205       // Multiply-accumulate
 206       for (int i = 0; i < 2; i++)
 207       {
 208         acc[2 * i + 0] =
 209             vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), vget_low_s16(input_dup2.val[i]));
 210         acc[2 * i + 1] =
 211             vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), vget_high_s16(input_dup2.val[i]));
 212       }
 213       // Store the accumulators back to acc_buffer
 214       for (int i = 0; i < 4; i++)
 215       {
 216         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 217       }
 218       acc_buffer_ptr += 16;
 219     }
 220     // Handle one output pixel at a time.
 221     for (; outp < num_output_pixels; outp++)
 222     {
 223       // Load the accumulators from acc_buffer
 224       int32x4_t acc[2];
 225       for (int i = 0; i < 2; i++)
 226       {
 227         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 228       }
 229       // Load the inputs, add input_offset.
 230       uint8x8_t input_u8 = vdup_n_u8(0);
 231       input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
 232       input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
 233       input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
 234       input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
 235       input_ptr += 4;
 236       const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
 237       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
 238       // Duplicate the input values, 2-fold
 239       const int16x4x2_t input_dup2 = vzip_s16(input, input);
 240       // Multiply-accumulate
 241       acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]);
 242       acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]);
 243       // Store the accumulators back to acc_buffer
 244       for (int i = 0; i < 2; i++)
 245       {
 246         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 247       }
 248       acc_buffer_ptr += 8;
 249     }
 250   }
 251 };
 252
 253 template <> struct QuantizedDepthwiseConvKernel<false, 2, 8>
 254 {
 255   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 256                   const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
 257                   const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
 258   {
 259     (void)input_depth;
 260     (void)depth_multiplier;
 261     (void)input_ptr_increment;
 262     // Load the filters, add filter_offset.
 263     int16x8_t filter[2];
 264     for (int i = 0; i < 2; i++)
 265     {
 266       const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
 267       const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
 268       filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
 269     }
 270     int outp = 0;
 271     // Handle two output pixels at a time.
 272     for (; outp <= num_output_pixels - 2; outp += 2)
 273     {
 274       // Load the accumulators from acc_buffer.
 275       int32x4_t acc[8];
 276       for (int i = 0; i < 8; i++)
 277       {
 278         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 279       }
 280       // Load the inputs, add input_offset.
 281       uint8x8_t input_u8 = vdup_n_u8(0);
 282       input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
 283       input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
 284       input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
 285       input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
 286       input_ptr += 4;
 287       const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
 288       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
 289       // Multiply-accumulate.
 290       acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
 291       acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
 292       acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
 293       acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
 294       acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2);
 295       acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2);
 296       acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3);
 297       acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3);
 298       // Store the accumulators back to acc_buffer.
 299       for (int i = 0; i < 8; i++)
 300       {
 301         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 302       }
 303       acc_buffer_ptr += 32;
 304     }
 305     // Handle one output pixel at a time.
 306     for (; outp < num_output_pixels; outp++)
 307     {
 308       // Load the accumulators from acc_buffer.
 309       int32x4_t acc[4];
 310       for (int i = 0; i < 4; i++)
 311       {
 312         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 313       }
 314       // Load the inputs, add input_offset.
 315       uint8x8_t input_u8 = vdup_n_u8(0);
 316       input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
 317       input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
 318       input_ptr += 2;
 319       const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
 320       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
 321
 322       // Multiply-accumulate.
 323       acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
 324       acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
 325       acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
 326       acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
 327
 328       // Store the accumulators back to acc_buffer.
 329       for (int i = 0; i < 4; i++)
 330       {
 331         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 332       }
 333       acc_buffer_ptr += 16;
 334     }
 335   }
 336 };
 337
 338 template <> struct QuantizedDepthwiseConvKernel<false, 2, 2>
 339 {
 340   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 341                   const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
 342                   const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
 343   {
 344     (void)input_depth;
 345     (void)depth_multiplier;
 346     (void)input_ptr_increment;
 347     // Load the filters, add filter_offset.
 348     uint8x8_t filter_u8 = vdup_n_u8(0);
 349     filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
 350     filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
 351     filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
 352     filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
 353     const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
 354     const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
 355
 356     int outp = 0;
 357     // Handle 4 output pixels at a time.
 358     for (; outp <= num_output_pixels - 4; outp += 4)
 359     {
 360       // Load the accumulators from acc_buffer
 361       int32x4_t acc[4];
 362       for (int i = 0; i < 4; i++)
 363       {
 364         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 365       }
 366
 367       // Load the inputs, add input_offset.
 368       const uint8x8_t input_u8 = vld1_u8(input_ptr);
 369       input_ptr += 8;
 370       const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
 371       const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
 372       // Duplicate the input values, 2-fold
 373       const int16x8x2_t input_dup2 = vzipq_s16(input, input);
 374       // Multiply-accumulate
 375       acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
 376       acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
 377       acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
 378       acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
 379       // Store the accumulators back to acc_buffer
 380       for (int i = 0; i < 4; i++)
 381       {
 382         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 383       }
 384       acc_buffer_ptr += 16;
 385     }
 386     // Handle one output pixel at a time.
 387     for (; outp < num_output_pixels; outp++)
 388     {
 389       // Load the accumulators from acc_buffer
 390       int32x4_t acc = vld1q_s32(acc_buffer_ptr);
 391
 392       uint8x8_t input_u8 = vdup_n_u8(0);
 393       input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
 394       input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
 395       input_ptr += 2;
 396       const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
 397       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
 398       // Duplicate the input values, 2-fold
 399       const int16x4_t input_dup2 = vzip_s16(input, input).val[0];
 400       // Multiply-accumulate
 401       acc = vmlal_s16(acc, filter, input_dup2);
 402       // Store the accumulators back to acc_buffer
 403       vst1q_s32(acc_buffer_ptr, acc);
 404       acc_buffer_ptr += 4;
 405     }
 406   }
 407 };
 408
 409 template <> struct QuantizedDepthwiseConvKernel<false, 2, 1>
 410 {
 411   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 412                   const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
 413                   const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
 414   {
 415     (void)input_depth;
 416     (void)depth_multiplier;
 417     (void)input_ptr_increment;
 418     // Load the filters, add filter_offset.
 419     uint8x8_t filter_u8 = vdup_n_u8(0);
 420     filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
 421     filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
 422     filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
 423     filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
 424     const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
 425     const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
 426
 427     int outp = 0;
 428     // Handle 8 output pixels at a time.
 429     for (; outp <= num_output_pixels - 8; outp += 8)
 430     {
 431       // Load the accumulators from acc_buffer.
 432       int32x4_t acc[4];
 433       for (int i = 0; i < 4; i++)
 434       {
 435         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 436       }
 437       // Load the inputs, add input_offset.
 438       uint8x8_t input_u8[2];
 439       for (int i = 0; i < 2; i++)
 440       {
 441         input_u8[i] = vld1_u8(input_ptr + 8 * i);
 442       }
 443       input_ptr += 16;
 444       int16x8_t input[2];
 445       for (int i = 0; i < 2; i++)
 446       {
 447         input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
 448       }
 449       for (int i = 0; i < 2; i++)
 450       {
 451         input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
 452       }
 453
 454       // Multiply-accumulate.
 455       acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0]));
 456       acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0]));
 457       acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1]));
 458       acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1]));
 459       // Store the accumulators back to acc_buffer.
 460       for (int i = 0; i < 4; i++)
 461       {
 462         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 463       }
 464       acc_buffer_ptr += 16;
 465     }
 466     // Handle 4 output pixels at a time.
 467     for (; outp <= num_output_pixels - 4; outp += 4)
 468     {
 469       // Load the accumulators from acc_buffer.
 470       int32x4_t acc[2];
 471       for (int i = 0; i < 2; i++)
 472       {
 473         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 474       }
 475       // Load the inputs, add input_offset.
 476       const uint8x8_t input_u8 = vld1_u8(input_ptr);
 477       input_ptr += 8;
 478       const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
 479       const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
 480
 481       // Multiply-accumulate.
 482       acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input));
 483       acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input));
 484       // Store the accumulators back to acc_buffer.
 485       for (int i = 0; i < 2; i++)
 486       {
 487         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 488       }
 489       acc_buffer_ptr += 8;
 490     }
 491     // Handle 2 output pixels at a time.
 492     for (; outp <= num_output_pixels - 2; outp += 2)
 493     {
 494       // Load the accumulators from acc_buffer.
 495       int32x4_t acc = vld1q_s32(acc_buffer_ptr);
 496       // Load the inputs, add input_offset.
 497       uint8x8_t input_u8 = vdup_n_u8(0);
 498       input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
 499       input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
 500       input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
 501       input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
 502       input_ptr += 4;
 503       const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
 504       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
 505
 506       // Multiply-accumulate.
 507       acc = vmlal_s16(acc, filter, input);
 508       // Store the accumulators back to acc_buffer.
 509       vst1q_s32(acc_buffer_ptr, acc);
 510       acc_buffer_ptr += 4;
 511     }
 512     // Handle 1 output pixel at a time.
 513     for (; outp < num_output_pixels; outp++)
 514     {
 515       // Load the accumulators from acc_buffer.
 516       int32x2_t acc = vld1_s32(acc_buffer_ptr);
 517       // Load the inputs, add input_offset.
 518       uint8x8_t input_u8 = vdup_n_u8(0);
 519       input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
 520       input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
 521       input_ptr += 2;
 522       const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
 523       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
 524
 525       // Multiply-accumulate.
 526       acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
 527       // Store the accumulators back to acc_buffer.
 528       vst1_s32(acc_buffer_ptr, acc);
 529       acc_buffer_ptr += 2;
 530     }
 531   }
 532 };
 533
 534 template <> struct QuantizedDepthwiseConvKernel<false, 1, 2>
 535 {
 536   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 537                   const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
 538                   const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
 539   {
 540     (void)input_depth;
 541     (void)depth_multiplier;
 542     (void)input_ptr_increment;
 543     // Load the filters, add filter_offset.
 544     uint8x8_t filter_u8 = vdup_n_u8(0);
 545     filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
 546     filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
 547     filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
 548     filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
 549     const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
 550     const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
 551
 552     int outp = 0;
 553     // Handle 8 output pixels at a time.
 554     for (; outp <= num_output_pixels - 8; outp += 8)
 555     {
 556       // Load the accumulators from acc_buffer
 557       int32x4_t acc[4];
 558       for (int i = 0; i < 4; i++)
 559       {
 560         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 561       }
 562
 563       // Load the inputs, add input_offset.
 564       const uint8x8_t input_u8 = vld1_u8(input_ptr);
 565       input_ptr += 8;
 566       const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
 567       const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
 568       // Duplicate the input values, 2-fold
 569       const int16x8x2_t input_dup2 = vzipq_s16(input, input);
 570       // Multiply-accumulate
 571       acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
 572       acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
 573       acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
 574       acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
 575       // Store the accumulators back to acc_buffer
 576       for (int i = 0; i < 4; i++)
 577       {
 578         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 579       }
 580       acc_buffer_ptr += 16;
 581     }
 582     // Handle one output pixel at a time.
 583     for (; outp < num_output_pixels; outp++)
 584     {
 585       // Load the accumulators from acc_buffer
 586       int32x2_t acc = vld1_s32(acc_buffer_ptr);
 587
 588       // Load the inputs, add input_offset.
 589       const uint32_t input = *input_ptr++ + input_offset;
 590
 591       // Multiply-accumulate
 592       acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input));
 593       // Store the accumulators back to acc_buffer
 594       vst1_s32(acc_buffer_ptr, acc);
 595       acc_buffer_ptr += 2;
 596     }
 597   }
 598 };
 599
 600 template <> struct QuantizedDepthwiseConvKernel<false, 1, 4>
 601 {
 602   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 603                   const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
 604                   const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
 605   {
 606     (void)input_depth;
 607     (void)depth_multiplier;
 608     (void)input_ptr_increment;
 609     // Load the filters, add filter_offset.
 610     uint8x8_t filter_u8 = vdup_n_u8(0);
 611     filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
 612     filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
 613     filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
 614     filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
 615     const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
 616     const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
 617
 618     int outp = 0;
 619     // Handle 8 output pixels at a time.
 620     for (; outp <= num_output_pixels - 8; outp += 8)
 621     {
 622       // Load the accumulators from acc_buffer
 623       int32x4_t acc[8];
 624       for (int i = 0; i < 8; i++)
 625       {
 626         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 627       }
 628
 629       // Load the inputs, add input_offset.
 630       uint8x8_t input_u8 = vld1_u8(input_ptr);
 631       input_ptr += 8;
 632       const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
 633       const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
 634
 635       // Multiply-accumulate
 636       acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0);
 637       acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1);
 638       acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2);
 639       acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3);
 640       acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0);
 641       acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1);
 642       acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2);
 643       acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3);
 644
 645       // Store the accumulators back to acc_buffer
 646       for (int i = 0; i < 8; i++)
 647       {
 648         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 649       }
 650       acc_buffer_ptr += 32;
 651     }
 652     // Handle 4 output pixels at a time.
 653     for (; outp <= num_output_pixels - 4; outp += 4)
 654     {
 655       // Load the accumulators from acc_buffer
 656       int32x4_t acc[4];
 657       for (int i = 0; i < 4; i++)
 658       {
 659         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 660       }
 661
 662       // Load the inputs, add input_offset.
 663       uint8x8_t input_u8 = vdup_n_u8(0);
 664       input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
 665       input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
 666       input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
 667       input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
 668       input_ptr += 4;
 669       const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
 670       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
 671
 672       // Multiply-accumulate
 673       acc[0] = vmlal_lane_s16(acc[0], filter, input, 0);
 674       acc[1] = vmlal_lane_s16(acc[1], filter, input, 1);
 675       acc[2] = vmlal_lane_s16(acc[2], filter, input, 2);
 676       acc[3] = vmlal_lane_s16(acc[3], filter, input, 3);
 677
 678       // Store the accumulators back to acc_buffer
 679       for (int i = 0; i < 4; i++)
 680       {
 681         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 682       }
 683       acc_buffer_ptr += 16;
 684     }
 685     // Handle one output pixel at a time.
 686     for (; outp < num_output_pixels; outp++)
 687     {
 688       // Load the accumulators from acc_buffer
 689       int32x4_t acc = vld1q_s32(acc_buffer_ptr);
 690
 691       // Load the inputs, add input_offset.
 692       const uint32_t input = *input_ptr++ + input_offset;
 693
 694       // Multiply-accumulate
 695       acc = vmlal_n_s16(acc, filter, input);
 696       // Store the accumulators back to acc_buffer
 697       vst1q_s32(acc_buffer_ptr, acc);
 698       acc_buffer_ptr += 4;
 699     }
 700   }
 701 };
 702
 703 template <> struct QuantizedDepthwiseConvKernel<false, 4, 1>
 704 {
 705   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 706                   const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
 707                   const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
 708   {
 709     (void)input_depth;
 710     (void)depth_multiplier;
 711     (void)input_ptr_increment;
 712     // Load the filters, add filter_offset.
 713     uint8x8_t filter_u8 = vdup_n_u8(0);
 714     filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
 715     filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
 716     filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
 717     filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
 718     const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
 719     const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
 720
 721     int outp = 0;
 722     // Handle 4 output pixels at a time.
 723     for (; outp <= num_output_pixels - 4; outp += 4)
 724     {
 725       // Load the accumulators from acc_buffer
 726       int32x4_t acc[4];
 727       for (int i = 0; i < 4; i++)
 728       {
 729         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 730       }
 731       // Load the inputs, add input_offset.
 732       int16x8_t input[2];
 733       for (int i = 0; i < 2; i++)
 734       {
 735         const uint8x8_t input_u8 = vld1_u8(input_ptr + 8 * i);
 736         const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
 737         input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
 738       }
 739       input_ptr += 16;
 740       // Multiply-accumulate
 741       for (int i = 0; i < 2; i++)
 742       {
 743         acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i]));
 744         acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i]));
 745       }
 746       // Store the accumulators back to acc_buffer
 747       for (int i = 0; i < 4; i++)
 748       {
 749         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 750       }
 751       acc_buffer_ptr += 16;
 752     }
 753     // Handle one output pixel at a time.
 754     for (; outp < num_output_pixels; outp++)
 755     {
 756       // Load the accumulators from acc_buffer
 757       int32x4_t acc;
 758       acc = vld1q_s32(acc_buffer_ptr);
 759
 760       // Load the inputs, add input_offset.
 761       uint8x8_t input_u8 = vdup_n_u8(0);
 762       input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
 763       input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
 764       input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
 765       input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
 766       input_ptr += 4;
 767       const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
 768       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
 769       // Multiply-accumulate
 770       acc = vmlal_s16(acc, filter, input);
 771       // Store the accumulators back to acc_buffer
 772       vst1q_s32(acc_buffer_ptr, acc);
 773       acc_buffer_ptr += 4;
 774     }
 775   }
 776 };
 777
 778 template <> struct QuantizedDepthwiseConvKernel<false, 4, 4>
 779 {
 780   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 781                   const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
 782                   const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
 783   {
 784     (void)input_depth;
 785     (void)depth_multiplier;
 786     (void)input_ptr_increment;
 787     // Load the filters, add filter_offset.
 788     int16x8_t filter[2];
 789     for (int i = 0; i < 2; i++)
 790     {
 791       const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
 792       const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
 793       filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
 794     }
 795
 796     int outp = 0;
 797     // Handle 2 output pixels at a time.
 798     for (; outp <= num_output_pixels - 2; outp += 2)
 799     {
 800       // Load the accumulators from acc_buffer
 801       int32x4_t acc[8];
 802       for (int i = 0; i < 8; i++)
 803       {
 804         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 805       }
 806
 807       // Load the inputs, add input_offset.
 808       uint8x8_t input_u8 = vld1_u8(input_ptr);
 809       input_ptr += 8;
 810       const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
 811       const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
 812
 813       // Multiply-accumulate
 814       acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), vget_low_s16(input), 0);
 815       acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), vget_low_s16(input), 1);
 816       acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), vget_low_s16(input), 2);
 817       acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), vget_low_s16(input), 3);
 818       acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), vget_high_s16(input), 0);
 819       acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), vget_high_s16(input), 1);
 820       acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), vget_high_s16(input), 2);
 821       acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), vget_high_s16(input), 3);
 822       // Store the accumulators back to acc_buffer
 823       for (int i = 0; i < 8; i++)
 824       {
 825         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 826       }
 827       acc_buffer_ptr += 32;
 828     }
 829     // Handle one output pixel at a time.
 830     for (; outp < num_output_pixels; outp++)
 831     {
 832       // Load the accumulators from acc_buffer
 833       int32x4_t acc[4];
 834       for (int i = 0; i < 4; i++)
 835       {
 836         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 837       }
 838
 839       // Load the inputs, add input_offset.
 840       uint8x8_t input_u8 = vdup_n_u8(0);
 841       input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
 842       input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
 843       input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
 844       input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
 845       input_ptr += 4;
 846       const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
 847       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
 848
 849       // Multiply-accumulate
 850       acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
 851       acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1);
 852       acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2);
 853       acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3);
 854       // Store the accumulators back to acc_buffer
 855       for (int i = 0; i < 4; i++)
 856       {
 857         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 858       }
 859       acc_buffer_ptr += 16;
 860     }
 861   }
 862 };
 863
 864 template <> struct QuantizedDepthwiseConvKernel<true, 0, 3>
 865 {
 866   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 867                   const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
 868                   const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
 869   {
 870     (void)input_depth;
 871     (void)depth_multiplier;
 872     // We will have to duplicate bytes in a NEON register, 3-fold.
 873     // We will do that by register-level table-look-up using VTBL instructions.
 874     // Here we prepare the registers containing the table-lookup indices.
 875     static const uint8_t dup3_indices_array[3][8] = {
 876         {0, 0, 0, 1, 1, 1, 2, 2}, {2, 3, 3, 3, 4, 4, 4, 5}, {5, 5, 6, 6, 6, 7, 7, 7}};
 877     uint8x8_t dup3_indices[3];
 878     for (int i = 0; i < 3; i++)
 879     {
 880       dup3_indices[i] = vld1_u8(dup3_indices_array[i]);
 881     }
 882
 883     // Handle one output pixel at a time.
 884     for (int outp = 0; outp < num_output_pixels; outp++)
 885     {
 886       const uint8_t *local_filter_ptr = filter_ptr;
 887       const uint8_t *local_input_ptr = input_ptr;
 888       int ic = 0;
 889       // Handle 8 input channels at a time.
 890       for (; ic <= input_depth - 8; ic += 8)
 891       {
 892         // Load the filters, add filter_offset.
 893         int16x8_t filter[3];
 894         uint8x8x3_t filter_u8;
 895         filter_u8.val[0] = vld1_u8(local_filter_ptr);
 896         filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
 897         filter_u8.val[2] = vld1_u8(local_filter_ptr + 16);
 898         local_filter_ptr += 24;
 899         for (int i = 0; i < 3; i++)
 900         {
 901           const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
 902           filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
 903         }
 904         // Load the inputs, duplicate 3-fold, add input_offset.
 905         const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
 906         local_input_ptr += 8;
 907
 908         uint8x8_t input_u8_dup3[3];
 909         for (int i = 0; i < 3; i++)
 910         {
 911           input_u8_dup3[i] = vtbl1_u8(input_u8, dup3_indices[i]);
 912         }
 913         int16x8_t input_dup3[3];
 914         for (int i = 0; i < 3; i++)
 915         {
 916           const int16x8_t input_s16_dup3 = vreinterpretq_s16_u16(vmovl_u8(input_u8_dup3[i]));
 917           input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset));
 918         }
 919         // Load the accumulators from acc_buffer
 920         int32x4x3_t acc[2];
 921         for (int i = 0; i < 2; i++)
 922         {
 923           acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
 924           acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
 925           acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16);
 926         }
 927         // Multiply-accumulate
 928         for (int j = 0; j < 3; j++)
 929         {
 930           acc[0].val[j] =
 931               vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), vget_low_s16(filter[j]));
 932           acc[1].val[j] =
 933               vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), vget_high_s16(filter[j]));
 934         }
 935         // Store the accumulators back to acc_buffer
 936         for (int i = 0; i < 2; i++)
 937         {
 938           vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
 939           vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
 940           vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]);
 941         }
 942         acc_buffer_ptr += 24;
 943       }
 944       // Handle one input channel at a time.
 945       for (; ic < input_depth; ic++)
 946       {
 947         const uint16_t input_val = *local_input_ptr++ + input_offset;
 948         for (int i = 0; i < 3; i++)
 949         {
 950           const uint16_t filter_val = local_filter_ptr[i] + filter_offset;
 951           *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
 952         }
 953         local_filter_ptr += 3;
 954       }
 955       input_ptr += input_ptr_increment;
 956     }
 957   }
 958 };
 959
 960 template <> struct QuantizedDepthwiseConvKernel<true, 0, 2>
 961 {
 962   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 963                   const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
 964                   const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
 965   {
 966     (void)input_depth;
 967     (void)depth_multiplier;
 968     // Handle one output pixel at a time.
 969     for (int outp = 0; outp < num_output_pixels; outp++)
 970     {
 971       const uint8_t *local_filter_ptr = filter_ptr;
 972       const uint8_t *local_input_ptr = input_ptr;
 973       int ic = 0;
 974       // Handle 8 input channels at a time.
 975       for (; ic <= input_depth - 8; ic += 8)
 976       {
 977         // Load the filters, add filter_offset.
 978         int16x8_t filter[2];
 979         uint8x8x2_t filter_u8;
 980         filter_u8.val[0] = vld1_u8(local_filter_ptr);
 981         filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
 982         local_filter_ptr += 16;
 983         for (int i = 0; i < 2; i++)
 984         {
 985           const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
 986           filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
 987         }
 988         // Load the inputs, add input_offset, duplicate 2-fold.
 989         const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
 990         local_input_ptr += 8;
 991         const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
 992         const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
 993         const int16x8x2_t input_dup2 = vzipq_s16(input, input);
 994         // Load the accumulators from acc_buffer.
 995         int32x4x2_t acc[2];
 996         for (int i = 0; i < 2; i++)
 997         {
 998           acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
 999           acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
1000         }
1001         // Multiply-accumulate.
1002         for (int j = 0; j < 2; j++)
1003         {
1004           acc[0].val[j] =
1005               vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), vget_low_s16(input_dup2.val[j]));
1006           acc[1].val[j] =
1007               vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), vget_high_s16(input_dup2.val[j]));
1008         }
1009         // Store the accumulators back to acc_buffer.
1010         for (int i = 0; i < 2; i++)
1011         {
1012           vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
1013           vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
1014         }
1015         acc_buffer_ptr += 16;
1016       }
1017       // Handle one input channel at a time.
1018       for (; ic < input_depth; ic++)
1019       {
1020         // Load the inputs.
1021         const uint16_t input_val = *local_input_ptr++ + input_offset;
1022         for (int i = 0; i < 2; i++)
1023         {
1024           const uint16_t filter_val = local_filter_ptr[i] + filter_offset;
1025           *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
1026         }
1027         local_filter_ptr += 2;
1028       }
1029       input_ptr += input_ptr_increment;
1030     }
1031   }
1032 };
1033
1034 template <> struct QuantizedDepthwiseConvKernel<true, 0, 1>
1035 {
1036   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1037                   const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
1038                   const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
1039   {
1040     (void)input_depth;
1041     (void)depth_multiplier;
1042     // Handle one output pixel at a time.
1043     for (int outp = 0; outp < num_output_pixels; outp++)
1044     {
1045       const uint8_t *local_filter_ptr = filter_ptr;
1046       const uint8_t *local_input_ptr = input_ptr;
1047       int ic = 0;
1048       // Handle 16 input channels at a time.
1049       for (; ic <= input_depth - 16; ic += 16)
1050       {
1051         // Load the filters, add filter_offset.
1052         uint8x8_t filter_u8_0 = vld1_u8(local_filter_ptr + 8 * 0);
1053         uint8x8_t filter_u8_1 = vld1_u8(local_filter_ptr + 8 * 1);
1054         local_filter_ptr += 16;
1055         int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
1056         int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
1057         filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
1058         filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
1059         // Load the inputs, add input_offset.
1060         uint8x8_t input_u8_0 = vld1_u8(local_input_ptr + 8 * 0);
1061         uint8x8_t input_u8_1 = vld1_u8(local_input_ptr + 8 * 1);
1062         local_input_ptr += 16;
1063         int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
1064         int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
1065         input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
1066         input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
1067         // Load the accumulators from acc_buffer
1068         int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
1069         int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
1070         int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
1071         int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
1072         acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), vget_low_s16(filter_0));
1073         acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), vget_high_s16(filter_0));
1074         acc_2 = vmlal_s16(acc_2, vget_low_s16(input_1), vget_low_s16(filter_1));
1075         acc_3 = vmlal_s16(acc_3, vget_high_s16(input_1), vget_high_s16(filter_1));
1076         // Store the accumulators back to acc_buffer
1077         vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
1078         vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
1079         vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
1080         vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
1081         acc_buffer_ptr += 16;
1082       }
1083       // Handle 8 input channels at a time.
1084       for (; ic <= input_depth - 8; ic += 8)
1085       {
1086         // Load the filters, add filter_offset.
1087         const uint8x8_t filter_u8 = vld1_u8(local_filter_ptr);
1088         local_filter_ptr += 8;
1089         const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
1090         const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
1091         // Load the inputs, add input_offset.
1092         const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
1093         local_input_ptr += 8;
1094         const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
1095         const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
1096         // Load the accumulators from acc_buffer
1097         int32x4_t acc[2];
1098         for (int i = 0; i < 2; i++)
1099         {
1100           acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
1101         }
1102         // Multiply-accumulate
1103         acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
1104         acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
1105         // Store the accumulators back to acc_buffer
1106         for (int i = 0; i < 2; i++)
1107         {
1108           vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
1109         }
1110         acc_buffer_ptr += 8;
1111       }
1112       // Handle one input channel at a time.
1113       for (; ic < input_depth; ic++)
1114       {
1115         const uint16_t input_val = *local_input_ptr++ + input_offset;
1116         const uint16_t filter_val = *local_filter_ptr++ + filter_offset;
1117         *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
1118       }
1119       input_ptr += input_ptr_increment;
1120     }
1121   }
1122 };
1123
1124 template <> struct QuantizedDepthwiseConvKernel<true, 16, 1>
1125 {
1126   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1127                   const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
1128                   const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
1129   {
1130     (void)input_depth;
1131     (void)depth_multiplier;
1132     // Load the filters, add filter_offset.
1133     uint8x8_t filter_u8[2];
1134     for (int i = 0; i < 2; i++)
1135     {
1136       filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
1137     }
1138     int16x8_t filter[2];
1139     for (int i = 0; i < 2; i++)
1140     {
1141       filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
1142     }
1143     for (int i = 0; i < 2; i++)
1144     {
1145       filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
1146     }
1147     // Handle one output pixel at a time.
1148     for (int outp = 0; outp < num_output_pixels; outp++)
1149     {
1150       // Load the inputs, add input_offset.
1151       uint8x8_t input_u8[2];
1152       for (int i = 0; i < 2; i++)
1153       {
1154         input_u8[i] = vld1_u8(input_ptr + 8 * i);
1155       }
1156       input_ptr += input_ptr_increment;
1157       int16x8_t input[2];
1158       for (int i = 0; i < 2; i++)
1159       {
1160         input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
1161       }
1162       for (int i = 0; i < 2; i++)
1163       {
1164         input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
1165       }
1166       // Load the accumulators from acc_buffer
1167       int32x4_t acc[4];
1168       for (int i = 0; i < 4; i++)
1169       {
1170         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
1171       }
1172       // Multiply-accumulate
1173       for (int i = 0; i < 2; i++)
1174       {
1175         acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]), vget_low_s16(filter[i]));
1176         acc[2 * i + 1] =
1177             vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), vget_high_s16(filter[i]));
1178       }
1179       // Store the accumulators back to acc_buffer
1180       for (int i = 0; i < 4; i++)
1181       {
1182         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
1183       }
1184       acc_buffer_ptr += 16;
1185     }
1186   }
1187 };
1188
1189 template <> struct QuantizedDepthwiseConvKernel<true, 8, 1>
1190 {
1191   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1192                   const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
1193                   const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
1194   {
1195     (void)input_depth;
1196     (void)depth_multiplier;
1197     // Load the filters, add filter_offset.
1198     const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
1199     const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
1200     const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
1201     // Handle one output pixel at a time.
1202     for (int outp = 0; outp < num_output_pixels; outp++)
1203     {
1204       // Load the inputs, add input_offset.
1205       const uint8x8_t input_u8 = vld1_u8(input_ptr);
1206       const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
1207       const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
1208       // Load the accumulators from acc_buffer
1209       int32x4_t acc[2];
1210       for (int i = 0; i < 2; i++)
1211       {
1212         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
1213       }
1214       // Multiply-accumulate
1215       acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
1216       acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
1217       // Store the accumulators back to acc_buffer
1218       for (int i = 0; i < 2; i++)
1219       {
1220         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
1221       }
1222       acc_buffer_ptr += 8;
1223       input_ptr += input_ptr_increment;
1224     }
1225   }
1226 };
1227
1228 template <> struct QuantizedDepthwiseConvKernel<true, 1, 16>
1229 {
1230   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1231                   const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
1232                   const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
1233   {
1234     (void)input_depth;
1235     (void)depth_multiplier;
1236     // Load the filters, add filter_offset.
1237     uint8x8_t filter_u8[2];
1238     for (int i = 0; i < 2; i++)
1239     {
1240       filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
1241     }
1242     int16x8_t filter[2];
1243     for (int i = 0; i < 2; i++)
1244     {
1245       filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
1246     }
1247     for (int i = 0; i < 2; i++)
1248     {
1249       filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
1250     }
1251     // Handle one output pixel at a time.
1252     for (int outp = 0; outp < num_output_pixels; outp++)
1253     {
1254       uint8_t input_u8 = *input_ptr;
1255       input_ptr += input_ptr_increment;
1256       uint16_t input = static_cast<int16_t>(input_u8 + input_offset);
1257       // Load the accumulators from acc_buffer
1258       int32x4_t acc[4];
1259       for (int i = 0; i < 4; i++)
1260       {
1261         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
1262       }
1263       // Multiply-accumulate
1264       for (int i = 0; i < 2; i++)
1265       {
1266         acc[2 * i + 0] = vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input);
1267         acc[2 * i + 1] = vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input);
1268       }
1269       // Store the accumulators back to acc_buffer
1270       for (int i = 0; i < 4; i++)
1271       {
1272         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
1273       }
1274       acc_buffer_ptr += 16;
1275     }
1276   }
1277 };
1278
1279 template <> struct QuantizedDepthwiseConvKernel<true, 1, 32>
1280 {
1281   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1282                   const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
1283                   const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
1284   {
1285     (void)input_depth;
1286     (void)depth_multiplier;
1287     // Load the filters, add filter_offset.
1288     uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
1289     uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
1290     uint8x8_t filter_u8_2 = vld1_u8(filter_ptr + 8 * 2);
1291     uint8x8_t filter_u8_3 = vld1_u8(filter_ptr + 8 * 3);
1292     int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
1293     int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
1294     int16x8_t filter_2 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_2));
1295     int16x8_t filter_3 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_3));
1296     filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
1297     filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
1298     filter_2 = vaddq_s16(filter_2, vdupq_n_s16(filter_offset));
1299     filter_3 = vaddq_s16(filter_3, vdupq_n_s16(filter_offset));
1300     // Handle one output pixel at a time.
1301     for (int outp = 0; outp < num_output_pixels; outp++)
1302     {
1303       uint8_t input_u8 = *input_ptr;
1304       input_ptr += input_ptr_increment;
1305       uint16_t input = static_cast<int16_t>(input_u8 + input_offset);
1306       // Load the accumulators from acc_buffer
1307       int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
1308       int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
1309       int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
1310       int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
1311       int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
1312       int32x4_t acc_5 = vld1q_s32(acc_buffer_ptr + 4 * 5);
1313       int32x4_t acc_6 = vld1q_s32(acc_buffer_ptr + 4 * 6);
1314       int32x4_t acc_7 = vld1q_s32(acc_buffer_ptr + 4 * 7);
1315       // Multiply-accumulate
1316       acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
1317       acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
1318       acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
1319       acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
1320       acc_4 = vmlal_n_s16(acc_4, vget_low_s16(filter_2), input);
1321       acc_5 = vmlal_n_s16(acc_5, vget_high_s16(filter_2), input);
1322       acc_6 = vmlal_n_s16(acc_6, vget_low_s16(filter_3), input);
1323       acc_7 = vmlal_n_s16(acc_7, vget_high_s16(filter_3), input);
1324       // Store the accumulators back to acc_buffer
1325       vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
1326       vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
1327       vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
1328       vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
1329       vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
1330       vst1q_s32(acc_buffer_ptr + 4 * 5, acc_5);
1331       vst1q_s32(acc_buffer_ptr + 4 * 6, acc_6);
1332       vst1q_s32(acc_buffer_ptr + 4 * 7, acc_7);
1333       acc_buffer_ptr += 32;
1334     }
1335   }
1336 };
1337
1338 template <> struct QuantizedDepthwiseConvKernel<true, 1, 20>
1339 {
1340   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1341                   const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
1342                   const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
1343   {
1344     (void)input_depth;
1345     (void)depth_multiplier;
1346     // Load the filters, add filter_offset.
1347     // NEON wants to load 8 bytes at a time, but 20 is not divisible by 8.
1348     // We load the first 16 bytes into filter_u8_{0,1} as usual.
1349     // Then we load the 8 last bytes into filter_u8_x  (x for 'extra').
1350     // This is redundant: the first 4 bytes of filter_u8_x are the same
1351     // as the last 4 bytes of filter_u8_x.
1352     uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
1353     uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
1354     uint8x8_t filter_u8_x = vld1_u8(filter_ptr + 8 * 1 + 4);
1355     int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
1356     int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
1357     int16x8_t filter_x = vreinterpretq_s16_u16(vmovl_u8(filter_u8_x));
1358     filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
1359     filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
1360     filter_x = vaddq_s16(filter_x, vdupq_n_s16(filter_offset));
1361     // Handle one output pixel at a time.
1362     for (int outp = 0; outp < num_output_pixels; outp++)
1363     {
1364       uint8_t input_u8 = *input_ptr;
1365       input_ptr += input_ptr_increment;
1366       uint16_t input = static_cast<int16_t>(input_u8 + input_offset);
1367       // Load the accumulators from acc_buffer
1368       int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
1369       int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
1370       int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
1371       int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
1372       int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
1373       // Multiply-accumulate
1374       acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
1375       acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
1376       acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
1377       acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
1378       acc_4 = vmlal_n_s16(acc_4, vget_high_s16(filter_x), input);
1379       // Store the accumulators back to acc_buffer
1380       vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
1381       vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
1382       vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
1383       vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
1384       vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
1385       acc_buffer_ptr += 20;
1386     }
1387   }
1388 };
1389
1390 template <> struct QuantizedDepthwiseConvKernel<true, 1, 8>
1391 {
1392   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1393                   const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
1394                   const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
1395   {
1396     (void)input_depth;
1397     (void)depth_multiplier;
1398     // Load the filters, add filter_offset.
1399     const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
1400     const int16x8_t filter =
1401         vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset));
1402     // Handle one output pixel at a time.
1403     for (int outp = 0; outp < num_output_pixels; outp++)
1404     {
1405       uint8_t input_u8 = *input_ptr;
1406       input_ptr += input_ptr_increment;
1407       uint16_t input = static_cast<int16_t>(input_u8 + input_offset);
1408       // Load the accumulators from acc_buffer
1409       int32x4_t acc[2];
1410       for (int i = 0; i < 2; i++)
1411       {
1412         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
1413       }
1414       // Multiply-accumulate
1415       acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input);
1416       acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input);
1417       // Store the accumulators back to acc_buffer
1418       for (int i = 0; i < 2; i++)
1419       {
1420         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
1421       }
1422       acc_buffer_ptr += 8;
1423     }
1424   }
1425 };
1426
1427 template <> struct QuantizedDepthwiseConvKernel<true, 2, 1>
1428 {
1429   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1430                   const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
1431                   const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
1432   {
1433     (void)input_depth;
1434     (void)depth_multiplier;
1435     // Load the filters, add filter_offset.
1436     uint8x8_t filter_u8 = vdup_n_u8(0);
1437     filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
1438     filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
1439     filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
1440     filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
1441     const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
1442     const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
1443
1444     int outp = 0;
1445
1446     // Handle 2 output pixels at a time.
1447     for (; outp <= num_output_pixels - 2; outp += 2)
1448     {
1449       // Load the accumulators from acc_buffer.
1450       int32x4_t acc = vld1q_s32(acc_buffer_ptr);
1451       // Load the inputs, add input_offset.
1452       uint16x4_t input_u16 = vdup_n_u16(0);
1453       input_u16 = vset_lane_u16((reinterpret_cast<const uint16_t *>(input_ptr))[0], input_u16, 0);
1454       input_ptr += input_ptr_increment;
1455       input_u16 = vset_lane_u16((reinterpret_cast<const uint16_t *>(input_ptr))[0], input_u16, 1);
1456       input_ptr += input_ptr_increment;
1457       const int16x4_t input_s16 =
1458           vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16))));
1459       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
1460
1461       // Multiply-accumulate.
1462       acc = vmlal_s16(acc, filter, input);
1463       // Store the accumulators back to acc_buffer.
1464       vst1q_s32(acc_buffer_ptr, acc);
1465       acc_buffer_ptr += 4;
1466     }
1467
1468     // Handle 1 output pixel at a time.
1469     for (; outp < num_output_pixels; outp++)
1470     {
1471       // Load the accumulators from acc_buffer.
1472       int32x2_t acc = vld1_s32(acc_buffer_ptr);
1473       // Load the inputs, add input_offset.
1474       uint8x8_t input_u8 = vdup_n_u8(0);
1475       input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
1476       input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
1477       input_ptr += input_ptr_increment;
1478       const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
1479       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
1480
1481       // Multiply-accumulate.
1482       acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
1483       // Store the accumulators back to acc_buffer.
1484       vst1_s32(acc_buffer_ptr, acc);
1485       acc_buffer_ptr += 2;
1486     }
1487   }
1488 };
1489
1490 template <> struct QuantizedDepthwiseConvKernel<true, 4, 1>
1491 {
1492   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1493                   const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
1494                   const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
1495   {
1496     (void)input_depth;
1497     (void)depth_multiplier;
1498     if (num_output_pixels <= 0)
1499     {
1500       return;
1501     }
1502
1503     // Load the filters, add filter_offset.
1504     uint8x8_t filter_u8 = vdup_n_u8(0);
1505     filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
1506     filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
1507     filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
1508     filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
1509     const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
1510     const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
1511
1512     int outp = 0;
1513
1514     // Handle one output pixel at a time until second to the last pixel. Second
1515     // to the last because we read eight input pixels while only processing
1516     // four.
1517     for (; outp < num_output_pixels - 1; outp++)
1518     {
1519       // Load the accumulators from acc_buffer
1520       int32x4_t acc;
1521       acc = vld1q_s32(acc_buffer_ptr);
1522
1523       // Load the inputs, add input_offset.
1524       uint8x8_t input_u8 = vld1_u8(input_ptr);
1525       input_ptr += input_ptr_increment;
1526       const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
1527       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
1528       // Multiply-accumulate
1529       acc = vmlal_s16(acc, filter, input);
1530       // Store the accumulators back to acc_buffer
1531       vst1q_s32(acc_buffer_ptr, acc);
1532       acc_buffer_ptr += 4;
1533     }
1534
1535     // Handle the last output pixel.
1536     // Load the accumulators from acc_buffer
1537     int32x4_t acc;
1538     acc = vld1q_s32(acc_buffer_ptr);
1539
1540     // Load the inputs, add input_offset.
1541     uint8x8_t input_u8 = vdup_n_u8(0);
1542     input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
1543     input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
1544     input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
1545     input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
1546     const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
1547     const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
1548     // Multiply-accumulate
1549     acc = vmlal_s16(acc, filter, input);
1550     // Store the accumulators back to acc_buffer
1551     vst1q_s32(acc_buffer_ptr, acc);
1552   }
1553 };
1554
1555 template <> struct QuantizedDepthwiseConvKernel<false, 12, 1>
1556 {
1557   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1558                   const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
1559                   const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
1560   {
1561     (void)input_depth;
1562     (void)depth_multiplier;
1563     // Load the filters, add filter_offset.
1564     uint8x8_t filter_u8_0 = vld1_u8(filter_ptr);
1565     uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 4);
1566     int16x8_t filter_s16_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
1567     int16x8_t filter_s16_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
1568     filter_s16_0 = vaddq_s16(filter_s16_0, vdupq_n_s16(filter_offset));
1569     filter_s16_1 = vaddq_s16(filter_s16_1, vdupq_n_s16(filter_offset));
1570     int16x4_t filter_0 = vget_low_s16(filter_s16_0);
1571     int16x4_t filter_1 = vget_high_s16(filter_s16_0);
1572     int16x4_t filter_2 = vget_high_s16(filter_s16_1);
1573
1574     // Handle one output pixel at a time.
1575     for (int outp = 0; outp < num_output_pixels; outp++)
1576     {
1577       // Load the inputs, add input_offset.
1578       uint8x8_t input_u8_0 = vld1_u8(input_ptr);
1579       uint8x8_t input_u8_1 = vld1_u8(input_ptr + 4);
1580       input_ptr += input_ptr_increment;
1581       int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
1582       int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
1583       input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
1584       input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
1585
1586       // Load the accumulators from acc_buffer
1587       int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
1588       int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
1589       int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
1590
1591       // Multiply-accumulate
1592       acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), filter_0);
1593       acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), filter_1);
1594       acc_2 = vmlal_s16(acc_2, vget_high_s16(input_1), filter_2);
1595
1596       // Store the accumulators back to acc_buffer
1597       vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
1598       vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
1599       vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
1600
1601       acc_buffer_ptr += 12;
1602     }
1603   }
1604 };
1605 #endif
1606
1607 // Accumulates the effect of one row of the filter, on a segment of one row
1608 // of the output, accessing the corresponding one row of the input.
1609 template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
1610 void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, int input_depth,
1611                                     int input_width, const uint8_t *input_data,
1612                                     int16_t input_offset, int pad_width, int depth_multiplier,
1613                                     int filter_width, const uint8_t *filter_data,
1614                                     int16_t filter_offset, int out_x_buffer_start,
1615                                     int out_x_buffer_end, int output_depth, int32_t *acc_buffer)
1616 {
1617   // Sanity check parameters. This is important in particular to ensure
1618   // that we keep the number of template instantiations minimal, so we don't
1619   // increase binary size unnecessarily.
1620   static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
1621   static_assert(kFixedInputDepth || kAllowStrided, "");
1622   assert(stride == 1 || kAllowStrided);
1623   if (kFixedInputDepth)
1624   {
1625     assert(input_depth == kFixedInputDepth);
1626   }
1627   if (kFixedDepthMultiplier)
1628   {
1629     assert(depth_multiplier == kFixedDepthMultiplier);
1630   }
1631   assert(output_depth == input_depth * depth_multiplier);
1632   const int input_ptr_increment = stride * input_depth;
1633   const uint8_t *filter_base_ptr = filter_data;
1634   for (int filter_x = 0; filter_x < filter_width; ++filter_x)
1635   {
1636     // For the current (filter_x, filter_y) point in the filter,
1637     // compute the boundaries of the corresponding output row segment.
1638     int out_x_loop_start_unclampled = 0;
1639     int out_x_loop_end_unclampled = 0;
1640     if (kAllowStrided)
1641     {
1642       if (stride == 2)
1643       {
1644         out_x_loop_start_unclampled = (pad_width - dilation_factor * filter_x + 1) / 2;
1645         out_x_loop_end_unclampled = (pad_width + input_width - dilation_factor * filter_x + 1) / 2;
1646       }
1647       else if (stride == 4)
1648       {
1649         out_x_loop_start_unclampled = (pad_width - dilation_factor * filter_x + 3) / 4;
1650         out_x_loop_end_unclampled = (pad_width + input_width - dilation_factor * filter_x + 3) / 4;
1651       }
1652       else
1653       {
1654         out_x_loop_start_unclampled =
1655             (pad_width - dilation_factor * filter_x + stride - 1) / stride;
1656         out_x_loop_end_unclampled =
1657             (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride;
1658       }
1659     }
1660     else
1661     {
1662       out_x_loop_start_unclampled = pad_width - dilation_factor * filter_x;
1663       out_x_loop_end_unclampled = pad_width + input_width - dilation_factor * filter_x;
1664     }
1665     // The kernel will have to iterate on the segment of the
1666     // output row that starts at out_x_loop_start and out_x_loop_end.
1667     const int out_x_loop_start = std::max(out_x_buffer_start, out_x_loop_start_unclampled);
1668     const int out_x_loop_end = std::min(out_x_buffer_end, out_x_loop_end_unclampled);
1669
1670     int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
1671     const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
1672     const uint8_t *input_ptr = input_data + in_x_origin * input_depth;
1673     const int num_output_pixels = out_x_loop_end - out_x_loop_start;
1674     QuantizedDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run(
1675         num_output_pixels, input_depth, depth_multiplier, input_ptr, input_offset,
1676         input_ptr_increment, filter_base_ptr, filter_offset, acc_buffer_ptr);
1677     filter_base_ptr += output_depth;
1678   }
1679 }
1680
// Portable, non-templatized fallback for accumulating one filter row into the
// accumulators of one output-row segment. For each filter tap along x and each
// output pixel whose input sample lies inside the row, it adds
// (input + input_offset) * (filter + filter_offset) for every
// (input channel, multiplier) pair to the next accumulator in sequence.
inline void QuantizedDepthwiseConvAccumRowGeneric(int stride, int dilation_factor, int input_depth,
                                                  int input_width, const uint8_t *input_data,
                                                  int16_t input_offset, int pad_width,
                                                  int depth_multiplier, int filter_width,
                                                  const uint8_t *filter_data, int16_t filter_offset,
                                                  int out_x_buffer_start, int out_x_buffer_end,
                                                  int output_depth, int32_t *acc_buffer)
{
  // Extra distance to skip after the input_depth values of one pixel have
  // been consumed, so that the cursor lands on the next strided pixel.
  const int input_ptr_increment = (stride - 1) * input_depth;

  const uint8_t *tap_filter = filter_data;
  for (int filter_x = 0; filter_x < filter_width; ++filter_x, tap_filter += output_depth)
  {
    // Range of output pixels served by this tap, clamped to the segment
    // held in the accumulator buffer.
    const int first_out_x = std::max(
        out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride);
    const int last_out_x =
        std::min(out_x_buffer_end,
                 (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride);

    // Input column read by the first processed output pixel for this tap.
    const int first_in_x = (first_out_x * stride) - pad_width + dilation_factor * filter_x;

    int32_t *acc_cursor = acc_buffer + (first_out_x - out_x_buffer_start) * output_depth;
    const uint8_t *input_cursor = input_data + first_in_x * input_depth;
    for (int out_x = first_out_x; out_x < last_out_x; ++out_x)
    {
      // Every pixel restarts at the beginning of this tap's filter values.
      const uint8_t *filter_cursor = tap_filter;
      for (int ic = 0; ic < input_depth; ++ic, ++input_cursor)
      {
        const int16_t input_val = *input_cursor + input_offset;
        for (int m = 0; m < depth_multiplier; ++m, ++filter_cursor, ++acc_cursor)
        {
          const int16_t filter_val = *filter_cursor + filter_offset;
          *acc_cursor += static_cast<int32_t>(filter_val) * input_val;
        }
      }
      input_cursor += input_ptr_increment;
    }
  }
}
1720
// Fills the accumulator buffer with num_output_pixels consecutive copies of
// the output_depth bias values, so the bias does not have to be added after
// accumulation. With NEON, depths 1, 2, 4, 8 and 16 are written with vector
// stores; whatever those paths leave over, and every other depth, is copied
// pixel by pixel with memcpy.
inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
                                       const int32_t *bias_data, int32_t *acc_buffer)
{
  // Index of the next pixel that still has to be initialized.
  int i = 0;
#ifdef USE_NEON
  switch (output_depth)
  {
    case 1:
    {
      // One vector holds the bias of four pixels.
      const int32x4_t bias = vdupq_n_s32(bias_data[0]);
      while (i <= num_output_pixels - 16)
      {
        vst1q_s32(acc_buffer + i + 0, bias);
        vst1q_s32(acc_buffer + i + 4, bias);
        vst1q_s32(acc_buffer + i + 8, bias);
        vst1q_s32(acc_buffer + i + 12, bias);
        i += 16;
      }
      while (i <= num_output_pixels - 4)
      {
        vst1q_s32(acc_buffer + i, bias);
        i += 4;
      }
      break;
    }
    case 2:
    {
      // One vector holds {b0, b1, b0, b1}, i.e. the bias of two pixels.
      int32x4_t bias = vdupq_n_s32(bias_data[0]);
      bias = vsetq_lane_s32(bias_data[1], bias, 1);
      bias = vsetq_lane_s32(bias_data[1], bias, 3);
      while (i <= num_output_pixels - 8)
      {
        vst1q_s32(acc_buffer + 2 * i + 0, bias);
        vst1q_s32(acc_buffer + 2 * i + 4, bias);
        vst1q_s32(acc_buffer + 2 * i + 8, bias);
        vst1q_s32(acc_buffer + 2 * i + 12, bias);
        i += 8;
      }
      while (i <= num_output_pixels - 2)
      {
        vst1q_s32(acc_buffer + 2 * i, bias);
        i += 2;
      }
      break;
    }
    case 4:
    {
      // One vector holds the bias of exactly one pixel.
      const int32x4_t bias = vld1q_s32(bias_data);
      while (i <= num_output_pixels - 4)
      {
        vst1q_s32(acc_buffer + 4 * i + 0, bias);
        vst1q_s32(acc_buffer + 4 * i + 4, bias);
        vst1q_s32(acc_buffer + 4 * i + 8, bias);
        vst1q_s32(acc_buffer + 4 * i + 12, bias);
        i += 4;
      }
      while (i < num_output_pixels)
      {
        vst1q_s32(acc_buffer + 4 * i, bias);
        ++i;
      }
      break;
    }
    case 8:
    {
      // Two vectors hold the bias of one pixel.
      const int32x4_t bias_lo = vld1q_s32(bias_data);
      const int32x4_t bias_hi = vld1q_s32(bias_data + 4);
      while (i <= num_output_pixels - 2)
      {
        vst1q_s32(acc_buffer + 8 * i + 0, bias_lo);
        vst1q_s32(acc_buffer + 8 * i + 4, bias_hi);
        vst1q_s32(acc_buffer + 8 * i + 8, bias_lo);
        vst1q_s32(acc_buffer + 8 * i + 12, bias_hi);
        i += 2;
      }
      while (i < num_output_pixels)
      {
        vst1q_s32(acc_buffer + 8 * i + 0, bias_lo);
        vst1q_s32(acc_buffer + 8 * i + 4, bias_hi);
        ++i;
      }
      break;
    }
    case 16:
    {
      // Four vectors hold the bias of one pixel.
      const int32x4_t bias_0 = vld1q_s32(bias_data);
      const int32x4_t bias_1 = vld1q_s32(bias_data + 4);
      const int32x4_t bias_2 = vld1q_s32(bias_data + 8);
      const int32x4_t bias_3 = vld1q_s32(bias_data + 12);
      while (i < num_output_pixels)
      {
        vst1q_s32(acc_buffer + 16 * i + 0, bias_0);
        vst1q_s32(acc_buffer + 16 * i + 4, bias_1);
        vst1q_s32(acc_buffer + 16 * i + 8, bias_2);
        vst1q_s32(acc_buffer + 16 * i + 12, bias_3);
        ++i;
      }
      break;
    }
    default:
      break;
  }
#endif
  // Pixels not covered above are initialized one at a time.
  while (i < num_output_pixels)
  {
    int32_t *pixel_acc = acc_buffer + i * output_depth;
    memcpy(pixel_acc, bias_data, sizeof(acc_buffer[0]) * output_depth);
    ++i;
  }
}
1811
1812 inline void DepthwiseConvGeneral(const DepthwiseConvParams &params, const Shape &input_shape,
1813                                  const uint8_t *input_data, const Shape &filter_shape,
1814                                  const uint8_t *filter_data, const Shape &bias_shape,
1815                                  const int32_t *bias_data, const Shape &output_shape,
1816                                  uint8_t *output_data)
1817 {
1818   (void)bias_shape;
1819   const int stride_width = params.stride_width;
1820   const int stride_height = params.stride_height;
1821   const int pad_width = params.padding_values.width;
1822   const int pad_height = params.padding_values.height;
1823   const int depth_multiplier = params.depth_multiplier;
1824   const int32_t output_activation_min = params.quantized_activation_min;
1825   const int32_t output_activation_max = params.quantized_activation_max;
1826   const int32_t input_offset = params.input_offset;
1827   const int32_t filter_offset = params.weights_offset;
1828   const int32_t output_offset = params.output_offset;
1829   const int32_t output_multiplier = params.output_multiplier;
1830   const int output_shift = params.output_shift;
1831   const int dilation_width_factor = params.dilation_width_factor;
1832   const int dilation_height_factor = params.dilation_height_factor;
1833   const int batches = MatchingDim(input_shape, 0, output_shape, 0);
1834   const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
1835   const int input_height = input_shape.Dims(1);
1836   const int input_width = input_shape.Dims(2);
1837   const int input_depth = input_shape.Dims(3);
1838   const int filter_height = filter_shape.Dims(1);
1839   const int filter_width = filter_shape.Dims(2);
1840   const int output_height = output_shape.Dims(1);
1841   const int output_width = output_shape.Dims(2);
1842 #ifdef USE_NEON
1843   const bool shift_left = (output_shift > 0);
1844   const int32_t multiplier_power_of_two = shift_left ? (1 << output_shift) : 1;
1845 #endif
1846
1847   static const int kAccBufferMaxSize = 2048;
1848   int32_t acc_buffer[kAccBufferMaxSize];
1849   assert(kAccBufferMaxSize >= output_depth);
1850   const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
1851   const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
1852   assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize);
1853   assert(kAccBufferActualSize <= kAccBufferMaxSize);
1854   assert(kOutputPixelsInAccBuffer >= 1);
1855   UNUSED_RELEASE(kAccBufferActualSize);
1856
1857   // row_accum_func will point to the core accumulation function to be used
1858   // for this DepthwiseConv op.
1859   using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric);
1860   row_accum_func_t row_accum_func = nullptr;
1861
1862 #define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \
1863   if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&                                  \
1864       (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&                             \
1865       depth_multiplier == FIXED_DEPTH_MULTIPLIER)                                                 \
1866   {                                                                                               \
1867     row_accum_func =                                                                              \
1868         QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \
1869   }
1870
1871 #ifdef USE_NEON
1872   // We go over our list of kernels by decreasing order of preference
1873   // for the cases where multiple kernels could apply.
1874
1875   // Start with the fastest kernels: AllowStrided=false, fixed input depth.
1876
1877   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2)
1878   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2)
1879   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2)
1880   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4)
1881   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1)
1882   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4)
1883   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
1884   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8)
1885   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
1886   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 12, 1)
1887
1888   // Next come the strided kernels: AllowStrided=true, fixed input depth.
1889   // They are a bit less efficient, but allow stride!=1.
1890
1891   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2)
1892   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1)
1893   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16)
1894   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
1895   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
1896   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
1897   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
1898   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
1899   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)
1900
1901   // Finally, the kernels allowing a variable input depth,
1902   // these are the least efficient but most general kernels.
1903
1904   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
1905   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
1906   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3)
1907 #endif // USE_NEON
1908
1909   // No matching fast kernel found, use slow fallback.
1910   if (!row_accum_func)
1911   {
1912     row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;
1913   }
1914
1915 #undef TFMINI_USE_DEPTHWISECONV_KERNEL
1916
1917   const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
1918   const int input_batch_stride = input_height_stride * input_shape.Dims(1);
1919   const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);
1920
1921   // Now that we have determined row_accum_func, we can start work.
1922   uint8_t *output_ptr = output_data;
1923   for (int b = 0; b < batches; ++b)
1924   {
1925     for (int out_y = 0; out_y < output_height; ++out_y)
1926     {
1927       const int in_y_origin = (out_y * stride_height) - pad_height;
1928       const int filter_y_start =
1929           std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor);
1930       const int filter_y_end =
1931           std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) /
1932                                       dilation_height_factor);
1933       for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
1934            out_x_buffer_start += kOutputPixelsInAccBuffer)
1935       {
1936         const int out_x_buffer_end =
1937             std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
1938         // We call a 'pixel' a group of activation that share all but the
1939         // 'depth'/'channel' coordinate. num_output_pixels is the number of
1940         // output pixels that we will accumulate in this loop iteration.
1941         const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
1942         // Initialize our local accumulator with the bias values, so we don't
1943         // have to add them later.
1944         DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer);
1945         // Accumulation loop. Most of the time should be spent in here.
1946         for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
1947         {
1948           const int in_y = in_y_origin + dilation_height_factor * filter_y;
1949           row_accum_func(stride_width, dilation_width_factor, input_depth, input_width,
1950                          input_data + in_y * input_height_stride + b * input_batch_stride,
1951                          input_offset, pad_width, depth_multiplier, filter_width,
1952                          filter_data + filter_y * filter_height_stride, filter_offset,
1953                          out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer);
1954         }
1955         // Finished accumulating int32 values. Now need to convert them to
1956         // the final 8bit form and store them.
1957         const int num_output_values = output_depth * num_output_pixels;
1958         int i = 0;
1959 #ifdef USE_NEON
1960         using gemmlowp::RoundingDivideByPOT;
1961         const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
1962         const int32x4_t output_activation_min_vec = vdupq_n_s32(output_activation_min);
1963         const int32x4_t output_activation_max_vec = vdupq_n_s32(output_activation_max);
1964         // Handle 16 values at once.
1965         // This allows us to issue 4 mutually independent int32
1966         // multiplications (vqrdmulh), which should alleviate most of their
1967         // high latency.
1968         for (; i <= num_output_values - 16; i += 16)
1969         {
1970           int32x4_t acc[4];
1971           for (int j = 0; j < 4; j++)
1972           {
1973             acc[j] = vld1q_s32(acc_buffer + i + 4 * j);
1974           }
1975
1976           if (!shift_left)
1977           {
1978             // Fixed-point multiplication.
1979             for (int j = 0; j < 4; j++)
1980             {
1981               acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
1982             }
1983             for (int j = 0; j < 4; j++)
1984             {
1985               acc[j] = RoundingDivideByPOT(acc[j], -output_shift);
1986             }
1987           }
1988           else
1989           {
1990             // Fixed-point multiplication.
1991             for (int j = 0; j < 4; j++)
1992             {
1993               acc[j] = vmulq_n_s32(acc[j], multiplier_power_of_two);
1994               acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
1995             }
1996           }
1997           // Add the output offset.
1998           for (int j = 0; j < 4; j++)
1999           {
2000             acc[j] = vaddq_s32(acc[j], output_offset_vec);
2001           }
2002           // Apply the activation function.
2003           for (int j = 0; j < 4; j++)
2004           {
2005             acc[j] = vmaxq_s32(acc[j], output_activation_min_vec);
2006           }
2007           for (int j = 0; j < 4; j++)
2008           {
2009             acc[j] = vminq_s32(acc[j], output_activation_max_vec);
2010           }
2011           // Saturating cast to uint8_t and store to destination.
2012           int16x4_t acc_s16[4];
2013           for (int j = 0; j < 4; j++)
2014           {
2015             acc_s16[j] = vqmovn_s32(acc[j]);
2016           }
2017           const int16x8_t res_s16_0 = vcombine_s16(acc_s16[0], acc_s16[1]);
2018           const int16x8_t res_s16_1 = vcombine_s16(acc_s16[2], acc_s16[3]);
2019           const uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0);
2020           const uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1);
2021           vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1));
2022           output_ptr += 16;
2023         }
2024         // Handle 8 values at once.
2025         // Not as good as 16 (now we're only issuing 2 mutually independent
2026         // vqrdmulh instructions, so we're probably paying for their high
2027         // latency).
2028         for (; i <= num_output_values - 8; i += 8)
2029         {
2030           int32x4_t acc0 = vld1q_s32(acc_buffer + i);
2031           int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4);
2032           if (!shift_left)
2033           {
2034             // Fixed-point multiplication.
2035             acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
2036             acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
2037             // Rounding right shift.
2038             acc0 = RoundingDivideByPOT(acc0, -output_shift);
2039             acc1 = RoundingDivideByPOT(acc1, -output_shift);
2040           }
2041           else
2042           {
2043             // Fixed-point multiplication.
2044             acc0 = vmulq_n_s32(acc0, multiplier_power_of_two);
2045             acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
2046
2047             acc1 = vmulq_n_s32(acc1, multiplier_power_of_two);
2048             acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
2049           }
2050           // Add the output offset.
2051           acc0 = vaddq_s32(acc0, output_offset_vec);
2052           acc1 = vaddq_s32(acc1, output_offset_vec);
2053           // Apply the activation function.
2054           acc0 = vmaxq_s32(acc0, output_activation_min_vec);
2055           acc1 = vmaxq_s32(acc1, output_activation_min_vec);
2056           acc0 = vminq_s32(acc0, output_activation_max_vec);
2057           acc1 = vminq_s32(acc1, output_activation_max_vec);
2058           // Saturating cast to uint8_t and store to destination.
2059           const int16x4_t acc0_s16 = vqmovn_s32(acc0);
2060           const int16x4_t acc1_s16 = vqmovn_s32(acc1);
2061           const int16x8_t res_s16 = vcombine_s16(acc0_s16, acc1_s16);
2062           const uint8x8_t res_u8 = vqmovun_s16(res_s16);
2063           vst1_u8(output_ptr, res_u8);
2064           output_ptr += 8;
2065         }
2066         // Handle 4 values at once. Now we're paying the full price of the
2067         // high latency of vqrdmulh. Also, storing only 4 bytes at the end
2068         // (without any alignment) can only be done 1 byte at a time.
2069         // Yet, that is still worth doing to minimize the amount of leftover
2070         // that will have to go through the very slow scalar code.
2071         for (; i <= num_output_values - 4; i += 4)
2072         {
2073           int32x4_t acc = vld1q_s32(acc_buffer + i);
2074           if (!shift_left)
2075           {
2076             // Fixed-point multiplication.
2077             acc = vqrdmulhq_n_s32(acc, output_multiplier);
2078             // Rounding right shift.
2079             acc = RoundingDivideByPOT(acc, -output_shift);
2080           }
2081           else
2082           {
2083             // Fixed-point multiplication.
2084             acc = vmulq_n_s32(acc, multiplier_power_of_two);
2085             acc = vqrdmulhq_n_s32(acc, output_multiplier);
2086           }
2087           // Add the output offset.
2088           acc = vaddq_s32(acc, output_offset_vec);
2089           // Apply the activation function.
2090           acc = vmaxq_s32(acc, output_activation_min_vec);
2091           acc = vminq_s32(acc, output_activation_max_vec);
2092           // Saturating cast to uint8_t and store to destination.
2093           const int16x4_t acc_s16 = vqmovn_s32(acc);
2094           const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16);
2095           const uint8x8_t res_u8 = vqmovun_s16(res_s16);
2096           vst1_lane_u8(output_ptr + 0, res_u8, 0);
2097           vst1_lane_u8(output_ptr + 1, res_u8, 1);
2098           vst1_lane_u8(output_ptr + 2, res_u8, 2);
2099           vst1_lane_u8(output_ptr + 3, res_u8, 3);
2100           output_ptr += 4;
2101         }
2102 #endif // USE_NEON
2103
2104         // Handle leftover values, one by one. This is very slow.
2105         for (; i < num_output_values; i++)
2106         {
2107           int32_t acc = acc_buffer[i];
2108           acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
2109           acc += output_offset;
2110           acc = std::max(acc, output_activation_min);
2111           acc = std::min(acc, output_activation_max);
2112           *output_ptr++ = static_cast<uint8_t>(acc);
2113         }
2114       }
2115     }
2116   }
2117 }
2118
2119 } // namespace optimized
2120 } // namespace cker
2121 } // namespace nnfw
2122
2123 #endif // __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_UINT8_H__