compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h

   1 /*
   2  * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
   3  * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
   4  *
   5  * Licensed under the Apache License, Version 2.0 (the "License");
   6  * you may not use this file except in compliance with the License.
   7  * You may obtain a copy of the License at
   8  *
   9  *      http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 #ifndef __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_UINT8_H__
  19 #define __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_UINT8_H__
  20
  21 #include "cker/Shape.h"
  22 #include "cker/Types.h"
  23 #include "cker/Utils.h"
  24 #include "cker/neon/neon_check.h"
  25
  26 #include <fixedpoint/fixedpoint.h>
  27 #include <public/gemmlowp.h>
  28
  29 namespace nnfw
  30 {
  31 namespace cker
  32 {
  33 namespace optimized
  34 {
  35 namespace depthwise_conv
  36 {
  37
  38 // Implementation of quantized DepthwiseConv
  39
  40 template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
  41 struct QuantizedDepthwiseConvKernel
  42 {
  43 };
  44
  45 #ifdef USE_NEON
  46 template <> struct QuantizedDepthwiseConvKernel<true, 8, 2>
  47 {
  48   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
  49                   const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
  50                   const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
  51   {
  52     (void)input_depth;
  53     (void)depth_multiplier;
  54     // Load the filters, add filter_offset.
  55     uint8x8x2_t filter_u8;
  56     filter_u8.val[0] = vld1_u8(filter_ptr);
  57     filter_u8.val[1] = vld1_u8(filter_ptr + 8);
  58     int16x8_t filter[2];
  59     for (int i = 0; i < 2; i++)
  60     {
  61       filter[i] =
  62         vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])), vdupq_n_s16(filter_offset));
  63     }
  64     // Handle one output pixel at a time.
  65     for (int outp = 0; outp < num_output_pixels; outp++)
  66     {
  67       // Load the accumulators from acc_buffer
  68       int32x4x2_t acc[2];
  69       for (int i = 0; i < 2; i++)
  70       {
  71         acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
  72         acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
  73       }
  74       // Load the inputs, add input_offset.
  75       const uint8x8_t input_u8 = vld1_u8(input_ptr);
  76       input_ptr += input_ptr_increment;
  77       const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
  78       const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
  79       // Duplicate the input values, 2-fold
  80       const int16x8x2_t input_dup2 = vzipq_s16(input, input);
  81       // Multiply-accumulate
  82       for (int i = 0; i < 2; i++)
  83       {
  84         acc[0].val[i] =
  85           vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), vget_low_s16(input_dup2.val[i]));
  86         acc[1].val[i] =
  87           vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), vget_high_s16(input_dup2.val[i]));
  88       }
  89       // Store the accumulators back to acc_buffer
  90       for (int i = 0; i < 2; i++)
  91       {
  92         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
  93         vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
  94       }
  95       acc_buffer_ptr += 16;
  96     }
  97   }
  98 };
  99
 100 template <> struct QuantizedDepthwiseConvKernel<false, 8, 1>
 101 {
 102   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 103                   const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
 104                   const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
 105   {
 106     (void)input_depth;
 107     (void)depth_multiplier;
 108     (void)input_ptr_increment;
 109     // Load the filters, add filter_offset.
 110     const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
 111     const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
 112     const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
 113
 114     int outp = 0;
 115     // Handle 2 output pixels at a time.
 116     for (; outp <= num_output_pixels - 2; outp += 2)
 117     {
 118       // Load the accumulators from acc_buffer.
 119       int32x4_t acc[4];
 120       for (int i = 0; i < 4; i++)
 121       {
 122         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 123       }
 124       // Load the inputs, add input_offset.
 125       uint8x8_t input_u8[2];
 126       for (int i = 0; i < 2; i++)
 127       {
 128         input_u8[i] = vld1_u8(input_ptr + 8 * i);
 129       }
 130       input_ptr += 16;
 131       int16x8_t input[2];
 132       for (int i = 0; i < 2; i++)
 133       {
 134         input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
 135       }
 136       for (int i = 0; i < 2; i++)
 137       {
 138         input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
 139       }
 140       // Multiply-accumulate.
 141       acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0]));
 142       acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0]));
 143       acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1]));
 144       acc[3] = vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1]));
 145       // Store the accumulators back to acc_buffer
 146       for (int i = 0; i < 4; i++)
 147       {
 148         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 149       }
 150       acc_buffer_ptr += 16;
 151     }
 152     // Handle 1 output pixel at a time.
 153     for (; outp < num_output_pixels; outp++)
 154     {
 155       // Load the accumulators from acc_buffer.
 156       int32x4_t acc[2];
 157       acc[0] = vld1q_s32(acc_buffer_ptr);
 158       acc[1] = vld1q_s32(acc_buffer_ptr + 4);
 159
 160       // Load the inputs, add input_offset.
 161       const uint8x8_t input_u8 = vld1_u8(input_ptr);
 162       input_ptr += 8;
 163       const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
 164       const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
 165       // Multiply-accumulate.
 166       acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input));
 167       acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input));
 168       // Store the accumulators back to acc_buffer
 169       vst1q_s32(acc_buffer_ptr, acc[0]);
 170       vst1q_s32(acc_buffer_ptr + 4, acc[1]);
 171       acc_buffer_ptr += 8;
 172     }
 173   }
 174 };
 175
 176 template <> struct QuantizedDepthwiseConvKernel<false, 4, 2>
 177 {
 178   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 179                   const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
 180                   const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
 181   {
 182     (void)input_depth;
 183     (void)depth_multiplier;
 184     (void)input_ptr_increment;
 185     // Load the filters, add filter_offset.
 186     const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
 187     const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
 188     const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
 189
 190     int outp = 0;
 191     // Handle 2 output pixels at a time.
 192     for (; outp <= num_output_pixels - 2; outp += 2)
 193     {
 194       // Load the accumulators from acc_buffer
 195       int32x4_t acc[4];
 196       for (int i = 0; i < 4; i++)
 197       {
 198         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 199       }
 200       // Load the inputs, add input_offset.
 201       const uint8x8_t input_u8 = vld1_u8(input_ptr);
 202       input_ptr += 8;
 203       const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
 204       const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
 205       // Duplicate the input values, 2-fold
 206       const int16x8x2_t input_dup2 = vzipq_s16(input, input);
 207       // Multiply-accumulate
 208       for (int i = 0; i < 2; i++)
 209       {
 210         acc[2 * i + 0] =
 211           vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), vget_low_s16(input_dup2.val[i]));
 212         acc[2 * i + 1] =
 213           vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), vget_high_s16(input_dup2.val[i]));
 214       }
 215       // Store the accumulators back to acc_buffer
 216       for (int i = 0; i < 4; i++)
 217       {
 218         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 219       }
 220       acc_buffer_ptr += 16;
 221     }
 222     // Handle one output pixel at a time.
 223     for (; outp < num_output_pixels; outp++)
 224     {
 225       // Load the accumulators from acc_buffer
 226       int32x4_t acc[2];
 227       for (int i = 0; i < 2; i++)
 228       {
 229         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 230       }
 231       // Load the inputs, add input_offset.
 232       uint8x8_t input_u8 = vdup_n_u8(0);
 233       input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
 234       input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
 235       input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
 236       input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
 237       input_ptr += 4;
 238       const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
 239       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
 240       // Duplicate the input values, 2-fold
 241       const int16x4x2_t input_dup2 = vzip_s16(input, input);
 242       // Multiply-accumulate
 243       acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]);
 244       acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]);
 245       // Store the accumulators back to acc_buffer
 246       for (int i = 0; i < 2; i++)
 247       {
 248         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 249       }
 250       acc_buffer_ptr += 8;
 251     }
 252   }
 253 };
 254
 255 template <> struct QuantizedDepthwiseConvKernel<false, 2, 8>
 256 {
 257   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 258                   const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
 259                   const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
 260   {
 261     (void)input_depth;
 262     (void)depth_multiplier;
 263     (void)input_ptr_increment;
 264     // Load the filters, add filter_offset.
 265     int16x8_t filter[2];
 266     for (int i = 0; i < 2; i++)
 267     {
 268       const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
 269       const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
 270       filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
 271     }
 272     int outp = 0;
 273     // Handle two output pixels at a time.
 274     for (; outp <= num_output_pixels - 2; outp += 2)
 275     {
 276       // Load the accumulators from acc_buffer.
 277       int32x4_t acc[8];
 278       for (int i = 0; i < 8; i++)
 279       {
 280         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 281       }
 282       // Load the inputs, add input_offset.
 283       uint8x8_t input_u8 = vdup_n_u8(0);
 284       input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
 285       input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
 286       input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
 287       input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
 288       input_ptr += 4;
 289       const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
 290       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
 291       // Multiply-accumulate.
 292       acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
 293       acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
 294       acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
 295       acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
 296       acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2);
 297       acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2);
 298       acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3);
 299       acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3);
 300       // Store the accumulators back to acc_buffer.
 301       for (int i = 0; i < 8; i++)
 302       {
 303         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 304       }
 305       acc_buffer_ptr += 32;
 306     }
 307     // Handle one output pixel at a time.
 308     for (; outp < num_output_pixels; outp++)
 309     {
 310       // Load the accumulators from acc_buffer.
 311       int32x4_t acc[4];
 312       for (int i = 0; i < 4; i++)
 313       {
 314         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 315       }
 316       // Load the inputs, add input_offset.
 317       uint8x8_t input_u8 = vdup_n_u8(0);
 318       input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
 319       input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
 320       input_ptr += 2;
 321       const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
 322       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
 323
 324       // Multiply-accumulate.
 325       acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
 326       acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
 327       acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
 328       acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
 329
 330       // Store the accumulators back to acc_buffer.
 331       for (int i = 0; i < 4; i++)
 332       {
 333         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 334       }
 335       acc_buffer_ptr += 16;
 336     }
 337   }
 338 };
 339
 340 template <> struct QuantizedDepthwiseConvKernel<false, 2, 2>
 341 {
 342   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 343                   const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
 344                   const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
 345   {
 346     (void)input_depth;
 347     (void)depth_multiplier;
 348     (void)input_ptr_increment;
 349     // Load the filters, add filter_offset.
 350     uint8x8_t filter_u8 = vdup_n_u8(0);
 351     filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
 352     filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
 353     filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
 354     filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
 355     const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
 356     const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
 357
 358     int outp = 0;
 359     // Handle 4 output pixels at a time.
 360     for (; outp <= num_output_pixels - 4; outp += 4)
 361     {
 362       // Load the accumulators from acc_buffer
 363       int32x4_t acc[4];
 364       for (int i = 0; i < 4; i++)
 365       {
 366         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 367       }
 368
 369       // Load the inputs, add input_offset.
 370       const uint8x8_t input_u8 = vld1_u8(input_ptr);
 371       input_ptr += 8;
 372       const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
 373       const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
 374       // Duplicate the input values, 2-fold
 375       const int16x8x2_t input_dup2 = vzipq_s16(input, input);
 376       // Multiply-accumulate
 377       acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
 378       acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
 379       acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
 380       acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
 381       // Store the accumulators back to acc_buffer
 382       for (int i = 0; i < 4; i++)
 383       {
 384         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 385       }
 386       acc_buffer_ptr += 16;
 387     }
 388     // Handle one output pixel at a time.
 389     for (; outp < num_output_pixels; outp++)
 390     {
 391       // Load the accumulators from acc_buffer
 392       int32x4_t acc = vld1q_s32(acc_buffer_ptr);
 393
 394       uint8x8_t input_u8 = vdup_n_u8(0);
 395       input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
 396       input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
 397       input_ptr += 2;
 398       const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
 399       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
 400       // Duplicate the input values, 2-fold
 401       const int16x4_t input_dup2 = vzip_s16(input, input).val[0];
 402       // Multiply-accumulate
 403       acc = vmlal_s16(acc, filter, input_dup2);
 404       // Store the accumulators back to acc_buffer
 405       vst1q_s32(acc_buffer_ptr, acc);
 406       acc_buffer_ptr += 4;
 407     }
 408   }
 409 };
 410
 411 template <> struct QuantizedDepthwiseConvKernel<false, 2, 1>
 412 {
 413   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 414                   const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
 415                   const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
 416   {
 417     (void)input_depth;
 418     (void)depth_multiplier;
 419     (void)input_ptr_increment;
 420     // Load the filters, add filter_offset.
 421     uint8x8_t filter_u8 = vdup_n_u8(0);
 422     filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
 423     filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
 424     filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
 425     filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
 426     const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
 427     const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
 428
 429     int outp = 0;
 430     // Handle 8 output pixels at a time.
 431     for (; outp <= num_output_pixels - 8; outp += 8)
 432     {
 433       // Load the accumulators from acc_buffer.
 434       int32x4_t acc[4];
 435       for (int i = 0; i < 4; i++)
 436       {
 437         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 438       }
 439       // Load the inputs, add input_offset.
 440       uint8x8_t input_u8[2];
 441       for (int i = 0; i < 2; i++)
 442       {
 443         input_u8[i] = vld1_u8(input_ptr + 8 * i);
 444       }
 445       input_ptr += 16;
 446       int16x8_t input[2];
 447       for (int i = 0; i < 2; i++)
 448       {
 449         input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
 450       }
 451       for (int i = 0; i < 2; i++)
 452       {
 453         input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
 454       }
 455
 456       // Multiply-accumulate.
 457       acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0]));
 458       acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0]));
 459       acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1]));
 460       acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1]));
 461       // Store the accumulators back to acc_buffer.
 462       for (int i = 0; i < 4; i++)
 463       {
 464         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 465       }
 466       acc_buffer_ptr += 16;
 467     }
 468     // Handle 4 output pixels at a time.
 469     for (; outp <= num_output_pixels - 4; outp += 4)
 470     {
 471       // Load the accumulators from acc_buffer.
 472       int32x4_t acc[2];
 473       for (int i = 0; i < 2; i++)
 474       {
 475         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 476       }
 477       // Load the inputs, add input_offset.
 478       const uint8x8_t input_u8 = vld1_u8(input_ptr);
 479       input_ptr += 8;
 480       const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
 481       const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
 482
 483       // Multiply-accumulate.
 484       acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input));
 485       acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input));
 486       // Store the accumulators back to acc_buffer.
 487       for (int i = 0; i < 2; i++)
 488       {
 489         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 490       }
 491       acc_buffer_ptr += 8;
 492     }
 493     // Handle 2 output pixels at a time.
 494     for (; outp <= num_output_pixels - 2; outp += 2)
 495     {
 496       // Load the accumulators from acc_buffer.
 497       int32x4_t acc = vld1q_s32(acc_buffer_ptr);
 498       // Load the inputs, add input_offset.
 499       uint8x8_t input_u8 = vdup_n_u8(0);
 500       input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
 501       input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
 502       input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
 503       input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
 504       input_ptr += 4;
 505       const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
 506       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
 507
 508       // Multiply-accumulate.
 509       acc = vmlal_s16(acc, filter, input);
 510       // Store the accumulators back to acc_buffer.
 511       vst1q_s32(acc_buffer_ptr, acc);
 512       acc_buffer_ptr += 4;
 513     }
 514     // Handle 1 output pixel at a time.
 515     for (; outp < num_output_pixels; outp++)
 516     {
 517       // Load the accumulators from acc_buffer.
 518       int32x2_t acc = vld1_s32(acc_buffer_ptr);
 519       // Load the inputs, add input_offset.
 520       uint8x8_t input_u8 = vdup_n_u8(0);
 521       input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
 522       input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
 523       input_ptr += 2;
 524       const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
 525       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
 526
 527       // Multiply-accumulate.
 528       acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
 529       // Store the accumulators back to acc_buffer.
 530       vst1_s32(acc_buffer_ptr, acc);
 531       acc_buffer_ptr += 2;
 532     }
 533   }
 534 };
 535
 536 template <> struct QuantizedDepthwiseConvKernel<false, 1, 2>
 537 {
 538   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 539                   const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
 540                   const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
 541   {
 542     (void)input_depth;
 543     (void)depth_multiplier;
 544     (void)input_ptr_increment;
 545     // Load the filters, add filter_offset.
 546     uint8x8_t filter_u8 = vdup_n_u8(0);
 547     filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
 548     filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
 549     filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
 550     filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
 551     const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
 552     const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
 553
 554     int outp = 0;
 555     // Handle 8 output pixels at a time.
 556     for (; outp <= num_output_pixels - 8; outp += 8)
 557     {
 558       // Load the accumulators from acc_buffer
 559       int32x4_t acc[4];
 560       for (int i = 0; i < 4; i++)
 561       {
 562         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 563       }
 564
 565       // Load the inputs, add input_offset.
 566       const uint8x8_t input_u8 = vld1_u8(input_ptr);
 567       input_ptr += 8;
 568       const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
 569       const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
 570       // Duplicate the input values, 2-fold
 571       const int16x8x2_t input_dup2 = vzipq_s16(input, input);
 572       // Multiply-accumulate
 573       acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
 574       acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
 575       acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
 576       acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
 577       // Store the accumulators back to acc_buffer
 578       for (int i = 0; i < 4; i++)
 579       {
 580         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 581       }
 582       acc_buffer_ptr += 16;
 583     }
 584     // Handle one output pixel at a time.
 585     for (; outp < num_output_pixels; outp++)
 586     {
 587       // Load the accumulators from acc_buffer
 588       int32x2_t acc = vld1_s32(acc_buffer_ptr);
 589
 590       // Load the inputs, add input_offset.
 591       const uint32_t input = *input_ptr++ + input_offset;
 592
 593       // Multiply-accumulate
 594       acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input));
 595       // Store the accumulators back to acc_buffer
 596       vst1_s32(acc_buffer_ptr, acc);
 597       acc_buffer_ptr += 2;
 598     }
 599   }
 600 };
 601
 602 template <> struct QuantizedDepthwiseConvKernel<false, 1, 4>
 603 {
 604   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 605                   const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
 606                   const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
 607   {
 608     (void)input_depth;
 609     (void)depth_multiplier;
 610     (void)input_ptr_increment;
 611     // Load the filters, add filter_offset.
 612     uint8x8_t filter_u8 = vdup_n_u8(0);
 613     filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
 614     filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
 615     filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
 616     filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
 617     const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
 618     const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
 619
 620     int outp = 0;
 621     // Handle 8 output pixels at a time.
 622     for (; outp <= num_output_pixels - 8; outp += 8)
 623     {
 624       // Load the accumulators from acc_buffer
 625       int32x4_t acc[8];
 626       for (int i = 0; i < 8; i++)
 627       {
 628         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 629       }
 630
 631       // Load the inputs, add input_offset.
 632       uint8x8_t input_u8 = vld1_u8(input_ptr);
 633       input_ptr += 8;
 634       const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
 635       const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
 636
 637       // Multiply-accumulate
 638       acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0);
 639       acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1);
 640       acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2);
 641       acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3);
 642       acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0);
 643       acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1);
 644       acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2);
 645       acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3);
 646
 647       // Store the accumulators back to acc_buffer
 648       for (int i = 0; i < 8; i++)
 649       {
 650         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 651       }
 652       acc_buffer_ptr += 32;
 653     }
 654     // Handle 4 output pixels at a time.
 655     for (; outp <= num_output_pixels - 4; outp += 4)
 656     {
 657       // Load the accumulators from acc_buffer
 658       int32x4_t acc[4];
 659       for (int i = 0; i < 4; i++)
 660       {
 661         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 662       }
 663
 664       // Load the inputs, add input_offset.
 665       uint8x8_t input_u8 = vdup_n_u8(0);
 666       input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
 667       input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
 668       input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
 669       input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
 670       input_ptr += 4;
 671       const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
 672       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
 673
 674       // Multiply-accumulate
 675       acc[0] = vmlal_lane_s16(acc[0], filter, input, 0);
 676       acc[1] = vmlal_lane_s16(acc[1], filter, input, 1);
 677       acc[2] = vmlal_lane_s16(acc[2], filter, input, 2);
 678       acc[3] = vmlal_lane_s16(acc[3], filter, input, 3);
 679
 680       // Store the accumulators back to acc_buffer
 681       for (int i = 0; i < 4; i++)
 682       {
 683         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 684       }
 685       acc_buffer_ptr += 16;
 686     }
 687     // Handle one output pixel at a time.
 688     for (; outp < num_output_pixels; outp++)
 689     {
 690       // Load the accumulators from acc_buffer
 691       int32x4_t acc = vld1q_s32(acc_buffer_ptr);
 692
 693       // Load the inputs, add input_offset.
 694       const uint32_t input = *input_ptr++ + input_offset;
 695
 696       // Multiply-accumulate
 697       acc = vmlal_n_s16(acc, filter, input);
 698       // Store the accumulators back to acc_buffer
 699       vst1q_s32(acc_buffer_ptr, acc);
 700       acc_buffer_ptr += 4;
 701     }
 702   }
 703 };
 704
 705 template <> struct QuantizedDepthwiseConvKernel<false, 4, 1>
 706 {
 707   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 708                   const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
 709                   const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
 710   {
 711     (void)input_depth;
 712     (void)depth_multiplier;
 713     (void)input_ptr_increment;
 714     // Load the filters, add filter_offset.
 715     uint8x8_t filter_u8 = vdup_n_u8(0);
 716     filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
 717     filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
 718     filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
 719     filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
 720     const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
 721     const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
 722
 723     int outp = 0;
 724     // Handle 4 output pixels at a time.
 725     for (; outp <= num_output_pixels - 4; outp += 4)
 726     {
 727       // Load the accumulators from acc_buffer
 728       int32x4_t acc[4];
 729       for (int i = 0; i < 4; i++)
 730       {
 731         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 732       }
 733       // Load the inputs, add input_offset.
 734       int16x8_t input[2];
 735       for (int i = 0; i < 2; i++)
 736       {
 737         const uint8x8_t input_u8 = vld1_u8(input_ptr + 8 * i);
 738         const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
 739         input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
 740       }
 741       input_ptr += 16;
 742       // Multiply-accumulate
 743       for (int i = 0; i < 2; i++)
 744       {
 745         acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i]));
 746         acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i]));
 747       }
 748       // Store the accumulators back to acc_buffer
 749       for (int i = 0; i < 4; i++)
 750       {
 751         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 752       }
 753       acc_buffer_ptr += 16;
 754     }
 755     // Handle one output pixel at a time.
 756     for (; outp < num_output_pixels; outp++)
 757     {
 758       // Load the accumulators from acc_buffer
 759       int32x4_t acc;
 760       acc = vld1q_s32(acc_buffer_ptr);
 761
 762       // Load the inputs, add input_offset.
 763       uint8x8_t input_u8 = vdup_n_u8(0);
 764       input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
 765       input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
 766       input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
 767       input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
 768       input_ptr += 4;
 769       const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
 770       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
 771       // Multiply-accumulate
 772       acc = vmlal_s16(acc, filter, input);
 773       // Store the accumulators back to acc_buffer
 774       vst1q_s32(acc_buffer_ptr, acc);
 775       acc_buffer_ptr += 4;
 776     }
 777   }
 778 };
 779
 780 template <> struct QuantizedDepthwiseConvKernel<false, 4, 4>
 781 {
 782   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 783                   const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
 784                   const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
 785   {
 786     (void)input_depth;
 787     (void)depth_multiplier;
 788     (void)input_ptr_increment;
 789     // Load the filters, add filter_offset.
 790     int16x8_t filter[2];
 791     for (int i = 0; i < 2; i++)
 792     {
 793       const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
 794       const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
 795       filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
 796     }
 797
 798     int outp = 0;
 799     // Handle 2 output pixels at a time.
 800     for (; outp <= num_output_pixels - 2; outp += 2)
 801     {
 802       // Load the accumulators from acc_buffer
 803       int32x4_t acc[8];
 804       for (int i = 0; i < 8; i++)
 805       {
 806         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 807       }
 808
 809       // Load the inputs, add input_offset.
 810       uint8x8_t input_u8 = vld1_u8(input_ptr);
 811       input_ptr += 8;
 812       const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
 813       const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
 814
 815       // Multiply-accumulate
 816       acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), vget_low_s16(input), 0);
 817       acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), vget_low_s16(input), 1);
 818       acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), vget_low_s16(input), 2);
 819       acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), vget_low_s16(input), 3);
 820       acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), vget_high_s16(input), 0);
 821       acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), vget_high_s16(input), 1);
 822       acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), vget_high_s16(input), 2);
 823       acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), vget_high_s16(input), 3);
 824       // Store the accumulators back to acc_buffer
 825       for (int i = 0; i < 8; i++)
 826       {
 827         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 828       }
 829       acc_buffer_ptr += 32;
 830     }
 831     // Handle one output pixel at a time.
 832     for (; outp < num_output_pixels; outp++)
 833     {
 834       // Load the accumulators from acc_buffer
 835       int32x4_t acc[4];
 836       for (int i = 0; i < 4; i++)
 837       {
 838         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
 839       }
 840
 841       // Load the inputs, add input_offset.
 842       uint8x8_t input_u8 = vdup_n_u8(0);
 843       input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
 844       input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
 845       input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
 846       input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
 847       input_ptr += 4;
 848       const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
 849       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
 850
 851       // Multiply-accumulate
 852       acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
 853       acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1);
 854       acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2);
 855       acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3);
 856       // Store the accumulators back to acc_buffer
 857       for (int i = 0; i < 4; i++)
 858       {
 859         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
 860       }
 861       acc_buffer_ptr += 16;
 862     }
 863   }
 864 };
 865
 866 template <> struct QuantizedDepthwiseConvKernel<true, 0, 3>
 867 {
 868   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 869                   const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
 870                   const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
 871   {
 872     (void)input_depth;
 873     (void)depth_multiplier;
 874     // We will have to duplicate bytes in a NEON register, 3-fold.
 875     // We will do that by register-level table-look-up using VTBL instructions.
 876     // Here we prepare the registers containing the table-lookup indices.
 877     static const uint8_t dup3_indices_array[3][8] = {
 878       {0, 0, 0, 1, 1, 1, 2, 2}, {2, 3, 3, 3, 4, 4, 4, 5}, {5, 5, 6, 6, 6, 7, 7, 7}};
 879     uint8x8_t dup3_indices[3];
 880     for (int i = 0; i < 3; i++)
 881     {
 882       dup3_indices[i] = vld1_u8(dup3_indices_array[i]);
 883     }
 884
 885     // Handle one output pixel at a time.
 886     for (int outp = 0; outp < num_output_pixels; outp++)
 887     {
 888       const uint8_t *local_filter_ptr = filter_ptr;
 889       const uint8_t *local_input_ptr = input_ptr;
 890       int ic = 0;
 891       // Handle 8 input channels at a time.
 892       for (; ic <= input_depth - 8; ic += 8)
 893       {
 894         // Load the filters, add filter_offset.
 895         int16x8_t filter[3];
 896         uint8x8x3_t filter_u8;
 897         filter_u8.val[0] = vld1_u8(local_filter_ptr);
 898         filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
 899         filter_u8.val[2] = vld1_u8(local_filter_ptr + 16);
 900         local_filter_ptr += 24;
 901         for (int i = 0; i < 3; i++)
 902         {
 903           const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
 904           filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
 905         }
 906         // Load the inputs, duplicate 3-fold, add input_offset.
 907         const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
 908         local_input_ptr += 8;
 909
 910         uint8x8_t input_u8_dup3[3];
 911         for (int i = 0; i < 3; i++)
 912         {
 913           input_u8_dup3[i] = vtbl1_u8(input_u8, dup3_indices[i]);
 914         }
 915         int16x8_t input_dup3[3];
 916         for (int i = 0; i < 3; i++)
 917         {
 918           const int16x8_t input_s16_dup3 = vreinterpretq_s16_u16(vmovl_u8(input_u8_dup3[i]));
 919           input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset));
 920         }
 921         // Load the accumulators from acc_buffer
 922         int32x4x3_t acc[2];
 923         for (int i = 0; i < 2; i++)
 924         {
 925           acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
 926           acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
 927           acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16);
 928         }
 929         // Multiply-accumulate
 930         for (int j = 0; j < 3; j++)
 931         {
 932           acc[0].val[j] =
 933             vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), vget_low_s16(filter[j]));
 934           acc[1].val[j] =
 935             vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), vget_high_s16(filter[j]));
 936         }
 937         // Store the accumulators back to acc_buffer
 938         for (int i = 0; i < 2; i++)
 939         {
 940           vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
 941           vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
 942           vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]);
 943         }
 944         acc_buffer_ptr += 24;
 945       }
 946       // Handle one input channel at a time.
 947       for (; ic < input_depth; ic++)
 948       {
 949         const int16_t input_val = *local_input_ptr++ + input_offset;
 950         for (int i = 0; i < 3; i++)
 951         {
 952           const int16_t filter_val = local_filter_ptr[i] + filter_offset;
 953           *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
 954         }
 955         local_filter_ptr += 3;
 956       }
 957       input_ptr += input_ptr_increment;
 958     }
 959   }
 960 };
 961
 962 template <> struct QuantizedDepthwiseConvKernel<true, 0, 2>
 963 {
 964   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 965                   const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
 966                   const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
 967   {
 968     (void)input_depth;
 969     (void)depth_multiplier;
 970     // Handle one output pixel at a time.
 971     for (int outp = 0; outp < num_output_pixels; outp++)
 972     {
 973       const uint8_t *local_filter_ptr = filter_ptr;
 974       const uint8_t *local_input_ptr = input_ptr;
 975       int ic = 0;
 976       // Handle 8 input channels at a time.
 977       for (; ic <= input_depth - 8; ic += 8)
 978       {
 979         // Load the filters, add filter_offset.
 980         int16x8_t filter[2];
 981         uint8x8x2_t filter_u8;
 982         filter_u8.val[0] = vld1_u8(local_filter_ptr);
 983         filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
 984         local_filter_ptr += 16;
 985         for (int i = 0; i < 2; i++)
 986         {
 987           const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
 988           filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
 989         }
 990         // Load the inputs, add input_offset, duplicate 2-fold.
 991         const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
 992         local_input_ptr += 8;
 993         const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
 994         const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
 995         const int16x8x2_t input_dup2 = vzipq_s16(input, input);
 996         // Load the accumulators from acc_buffer.
 997         int32x4x2_t acc[2];
 998         for (int i = 0; i < 2; i++)
 999         {
1000           acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
1001           acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
1002         }
1003         // Multiply-accumulate.
1004         for (int j = 0; j < 2; j++)
1005         {
1006           acc[0].val[j] =
1007             vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), vget_low_s16(input_dup2.val[j]));
1008           acc[1].val[j] =
1009             vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), vget_high_s16(input_dup2.val[j]));
1010         }
1011         // Store the accumulators back to acc_buffer.
1012         for (int i = 0; i < 2; i++)
1013         {
1014           vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
1015           vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
1016         }
1017         acc_buffer_ptr += 16;
1018       }
1019       // Handle one input channel at a time.
1020       for (; ic < input_depth; ic++)
1021       {
1022         // Load the inputs.
1023         const int16_t input_val = *local_input_ptr++ + input_offset;
1024         for (int i = 0; i < 2; i++)
1025         {
1026           const int16_t filter_val = local_filter_ptr[i] + filter_offset;
1027           *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
1028         }
1029         local_filter_ptr += 2;
1030       }
1031       input_ptr += input_ptr_increment;
1032     }
1033   }
1034 };
1035
1036 template <> struct QuantizedDepthwiseConvKernel<true, 0, 1>
1037 {
1038   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1039                   const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
1040                   const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
1041   {
1042     (void)input_depth;
1043     (void)depth_multiplier;
1044     // Handle one output pixel at a time.
1045     for (int outp = 0; outp < num_output_pixels; outp++)
1046     {
1047       const uint8_t *local_filter_ptr = filter_ptr;
1048       const uint8_t *local_input_ptr = input_ptr;
1049       int ic = 0;
1050       // Handle 16 input channels at a time.
1051       for (; ic <= input_depth - 16; ic += 16)
1052       {
1053         // Load the filters, add filter_offset.
1054         uint8x8_t filter_u8_0 = vld1_u8(local_filter_ptr + 8 * 0);
1055         uint8x8_t filter_u8_1 = vld1_u8(local_filter_ptr + 8 * 1);
1056         local_filter_ptr += 16;
1057         int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
1058         int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
1059         filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
1060         filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
1061         // Load the inputs, add input_offset.
1062         uint8x8_t input_u8_0 = vld1_u8(local_input_ptr + 8 * 0);
1063         uint8x8_t input_u8_1 = vld1_u8(local_input_ptr + 8 * 1);
1064         local_input_ptr += 16;
1065         int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
1066         int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
1067         input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
1068         input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
1069         // Load the accumulators from acc_buffer
1070         int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
1071         int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
1072         int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
1073         int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
1074         acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), vget_low_s16(filter_0));
1075         acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), vget_high_s16(filter_0));
1076         acc_2 = vmlal_s16(acc_2, vget_low_s16(input_1), vget_low_s16(filter_1));
1077         acc_3 = vmlal_s16(acc_3, vget_high_s16(input_1), vget_high_s16(filter_1));
1078         // Store the accumulators back to acc_buffer
1079         vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
1080         vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
1081         vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
1082         vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
1083         acc_buffer_ptr += 16;
1084       }
1085       // Handle 8 input channels at a time.
1086       for (; ic <= input_depth - 8; ic += 8)
1087       {
1088         // Load the filters, add filter_offset.
1089         const uint8x8_t filter_u8 = vld1_u8(local_filter_ptr);
1090         local_filter_ptr += 8;
1091         const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
1092         const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
1093         // Load the inputs, add input_offset.
1094         const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
1095         local_input_ptr += 8;
1096         const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
1097         const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
1098         // Load the accumulators from acc_buffer
1099         int32x4_t acc[2];
1100         for (int i = 0; i < 2; i++)
1101         {
1102           acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
1103         }
1104         // Multiply-accumulate
1105         acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
1106         acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
1107         // Store the accumulators back to acc_buffer
1108         for (int i = 0; i < 2; i++)
1109         {
1110           vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
1111         }
1112         acc_buffer_ptr += 8;
1113       }
1114       // Handle one input channel at a time.
1115       for (; ic < input_depth; ic++)
1116       {
1117         const int16_t input_val = *local_input_ptr++ + input_offset;
1118         const int16_t filter_val = *local_filter_ptr++ + filter_offset;
1119         *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
1120       }
1121       input_ptr += input_ptr_increment;
1122     }
1123   }
1124 };
1125
1126 template <> struct QuantizedDepthwiseConvKernel<true, 16, 1>
1127 {
1128   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1129                   const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
1130                   const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
1131   {
1132     (void)input_depth;
1133     (void)depth_multiplier;
1134     // Load the filters, add filter_offset.
1135     uint8x8_t filter_u8[2];
1136     for (int i = 0; i < 2; i++)
1137     {
1138       filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
1139     }
1140     int16x8_t filter[2];
1141     for (int i = 0; i < 2; i++)
1142     {
1143       filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
1144     }
1145     for (int i = 0; i < 2; i++)
1146     {
1147       filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
1148     }
1149     // Handle one output pixel at a time.
1150     for (int outp = 0; outp < num_output_pixels; outp++)
1151     {
1152       // Load the inputs, add input_offset.
1153       uint8x8_t input_u8[2];
1154       for (int i = 0; i < 2; i++)
1155       {
1156         input_u8[i] = vld1_u8(input_ptr + 8 * i);
1157       }
1158       input_ptr += input_ptr_increment;
1159       int16x8_t input[2];
1160       for (int i = 0; i < 2; i++)
1161       {
1162         input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
1163       }
1164       for (int i = 0; i < 2; i++)
1165       {
1166         input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
1167       }
1168       // Load the accumulators from acc_buffer
1169       int32x4_t acc[4];
1170       for (int i = 0; i < 4; i++)
1171       {
1172         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
1173       }
1174       // Multiply-accumulate
1175       for (int i = 0; i < 2; i++)
1176       {
1177         acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]), vget_low_s16(filter[i]));
1178         acc[2 * i + 1] =
1179           vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), vget_high_s16(filter[i]));
1180       }
1181       // Store the accumulators back to acc_buffer
1182       for (int i = 0; i < 4; i++)
1183       {
1184         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
1185       }
1186       acc_buffer_ptr += 16;
1187     }
1188   }
1189 };
1190
1191 template <> struct QuantizedDepthwiseConvKernel<true, 8, 1>
1192 {
1193   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1194                   const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
1195                   const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
1196   {
1197     (void)input_depth;
1198     (void)depth_multiplier;
1199     // Load the filters, add filter_offset.
1200     const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
1201     const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
1202     const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
1203     // Handle one output pixel at a time.
1204     for (int outp = 0; outp < num_output_pixels; outp++)
1205     {
1206       // Load the inputs, add input_offset.
1207       const uint8x8_t input_u8 = vld1_u8(input_ptr);
1208       const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
1209       const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
1210       // Load the accumulators from acc_buffer
1211       int32x4_t acc[2];
1212       for (int i = 0; i < 2; i++)
1213       {
1214         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
1215       }
1216       // Multiply-accumulate
1217       acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
1218       acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
1219       // Store the accumulators back to acc_buffer
1220       for (int i = 0; i < 2; i++)
1221       {
1222         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
1223       }
1224       acc_buffer_ptr += 8;
1225       input_ptr += input_ptr_increment;
1226     }
1227   }
1228 };
1229
1230 template <> struct QuantizedDepthwiseConvKernel<true, 1, 16>
1231 {
1232   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1233                   const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
1234                   const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
1235   {
1236     (void)input_depth;
1237     (void)depth_multiplier;
1238     // Load the filters, add filter_offset.
1239     uint8x8_t filter_u8[2];
1240     for (int i = 0; i < 2; i++)
1241     {
1242       filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
1243     }
1244     int16x8_t filter[2];
1245     for (int i = 0; i < 2; i++)
1246     {
1247       filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
1248     }
1249     for (int i = 0; i < 2; i++)
1250     {
1251       filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
1252     }
1253     // Handle one output pixel at a time.
1254     for (int outp = 0; outp < num_output_pixels; outp++)
1255     {
1256       uint8_t input_u8 = *input_ptr;
1257       input_ptr += input_ptr_increment;
1258       int16_t input = static_cast<int16_t>(input_u8) + input_offset;
1259       // Load the accumulators from acc_buffer
1260       int32x4_t acc[4];
1261       for (int i = 0; i < 4; i++)
1262       {
1263         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
1264       }
1265       // Multiply-accumulate
1266       for (int i = 0; i < 2; i++)
1267       {
1268         acc[2 * i + 0] = vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input);
1269         acc[2 * i + 1] = vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input);
1270       }
1271       // Store the accumulators back to acc_buffer
1272       for (int i = 0; i < 4; i++)
1273       {
1274         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
1275       }
1276       acc_buffer_ptr += 16;
1277     }
1278   }
1279 };
1280
1281 template <> struct QuantizedDepthwiseConvKernel<true, 1, 32>
1282 {
1283   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1284                   const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
1285                   const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
1286   {
1287     (void)input_depth;
1288     (void)depth_multiplier;
1289     // Load the filters, add filter_offset.
1290     uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
1291     uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
1292     uint8x8_t filter_u8_2 = vld1_u8(filter_ptr + 8 * 2);
1293     uint8x8_t filter_u8_3 = vld1_u8(filter_ptr + 8 * 3);
1294     int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
1295     int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
1296     int16x8_t filter_2 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_2));
1297     int16x8_t filter_3 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_3));
1298     filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
1299     filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
1300     filter_2 = vaddq_s16(filter_2, vdupq_n_s16(filter_offset));
1301     filter_3 = vaddq_s16(filter_3, vdupq_n_s16(filter_offset));
1302     // Handle one output pixel at a time.
1303     for (int outp = 0; outp < num_output_pixels; outp++)
1304     {
1305       uint8_t input_u8 = *input_ptr;
1306       input_ptr += input_ptr_increment;
1307       int16_t input = static_cast<int16_t>(input_u8) + input_offset;
1308       // Load the accumulators from acc_buffer
1309       int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
1310       int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
1311       int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
1312       int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
1313       int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
1314       int32x4_t acc_5 = vld1q_s32(acc_buffer_ptr + 4 * 5);
1315       int32x4_t acc_6 = vld1q_s32(acc_buffer_ptr + 4 * 6);
1316       int32x4_t acc_7 = vld1q_s32(acc_buffer_ptr + 4 * 7);
1317       // Multiply-accumulate
1318       acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
1319       acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
1320       acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
1321       acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
1322       acc_4 = vmlal_n_s16(acc_4, vget_low_s16(filter_2), input);
1323       acc_5 = vmlal_n_s16(acc_5, vget_high_s16(filter_2), input);
1324       acc_6 = vmlal_n_s16(acc_6, vget_low_s16(filter_3), input);
1325       acc_7 = vmlal_n_s16(acc_7, vget_high_s16(filter_3), input);
1326       // Store the accumulators back to acc_buffer
1327       vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
1328       vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
1329       vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
1330       vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
1331       vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
1332       vst1q_s32(acc_buffer_ptr + 4 * 5, acc_5);
1333       vst1q_s32(acc_buffer_ptr + 4 * 6, acc_6);
1334       vst1q_s32(acc_buffer_ptr + 4 * 7, acc_7);
1335       acc_buffer_ptr += 32;
1336     }
1337   }
1338 };
1339
1340 template <> struct QuantizedDepthwiseConvKernel<true, 1, 20>
1341 {
1342   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1343                   const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
1344                   const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
1345   {
1346     (void)input_depth;
1347     (void)depth_multiplier;
1348     // Load the filters, add filter_offset.
1349     // NEON wants to load 8 bytes at a time, but 20 is not divisible by 8.
1350     // We load the first 16 bytes into filter_u8_{0,1} as usual.
1351     // Then we load the 8 last bytes into filter_u8_x  (x for 'extra').
1352     // This is redundant: the first 4 bytes of filter_u8_x are the same
1353     // as the last 4 bytes of filter_u8_x.
1354     uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
1355     uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
1356     uint8x8_t filter_u8_x = vld1_u8(filter_ptr + 8 * 1 + 4);
1357     int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
1358     int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
1359     int16x8_t filter_x = vreinterpretq_s16_u16(vmovl_u8(filter_u8_x));
1360     filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
1361     filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
1362     filter_x = vaddq_s16(filter_x, vdupq_n_s16(filter_offset));
1363     // Handle one output pixel at a time.
1364     for (int outp = 0; outp < num_output_pixels; outp++)
1365     {
1366       uint8_t input_u8 = *input_ptr;
1367       input_ptr += input_ptr_increment;
1368       int16_t input = static_cast<int16_t>(input_u8) + input_offset;
1369       // Load the accumulators from acc_buffer
1370       int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
1371       int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
1372       int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
1373       int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
1374       int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
1375       // Multiply-accumulate
1376       acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
1377       acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
1378       acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
1379       acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
1380       acc_4 = vmlal_n_s16(acc_4, vget_high_s16(filter_x), input);
1381       // Store the accumulators back to acc_buffer
1382       vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
1383       vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
1384       vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
1385       vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
1386       vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
1387       acc_buffer_ptr += 20;
1388     }
1389   }
1390 };
1391
1392 template <> struct QuantizedDepthwiseConvKernel<true, 1, 8>
1393 {
1394   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1395                   const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
1396                   const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
1397   {
1398     (void)input_depth;
1399     (void)depth_multiplier;
1400     // Load the filters, add filter_offset.
1401     const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
1402     const int16x8_t filter =
1403       vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset));
1404     // Handle one output pixel at a time.
1405     for (int outp = 0; outp < num_output_pixels; outp++)
1406     {
1407       uint8_t input_u8 = *input_ptr;
1408       input_ptr += input_ptr_increment;
1409       int16_t input = static_cast<int16_t>(input_u8) + input_offset;
1410       // Load the accumulators from acc_buffer
1411       int32x4_t acc[2];
1412       for (int i = 0; i < 2; i++)
1413       {
1414         acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
1415       }
1416       // Multiply-accumulate
1417       acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input);
1418       acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input);
1419       // Store the accumulators back to acc_buffer
1420       for (int i = 0; i < 2; i++)
1421       {
1422         vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
1423       }
1424       acc_buffer_ptr += 8;
1425     }
1426   }
1427 };
1428
1429 template <> struct QuantizedDepthwiseConvKernel<true, 2, 1>
1430 {
1431   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1432                   const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
1433                   const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
1434   {
1435     (void)input_depth;
1436     (void)depth_multiplier;
1437     // Load the filters, add filter_offset.
1438     uint8x8_t filter_u8 = vdup_n_u8(0);
1439     filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
1440     filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
1441     filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
1442     filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
1443     const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
1444     const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
1445
1446     int outp = 0;
1447
1448     // Handle 2 output pixels at a time.
1449     for (; outp <= num_output_pixels - 2; outp += 2)
1450     {
1451       // Load the accumulators from acc_buffer.
1452       int32x4_t acc = vld1q_s32(acc_buffer_ptr);
1453       // Load the inputs, add input_offset.
1454       uint16x4_t input_u16 = vdup_n_u16(0);
1455       input_u16 = vset_lane_u16((reinterpret_cast<const uint16_t *>(input_ptr))[0], input_u16, 0);
1456       input_ptr += input_ptr_increment;
1457       input_u16 = vset_lane_u16((reinterpret_cast<const uint16_t *>(input_ptr))[0], input_u16, 1);
1458       input_ptr += input_ptr_increment;
1459       const int16x4_t input_s16 =
1460         vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16))));
1461       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
1462
1463       // Multiply-accumulate.
1464       acc = vmlal_s16(acc, filter, input);
1465       // Store the accumulators back to acc_buffer.
1466       vst1q_s32(acc_buffer_ptr, acc);
1467       acc_buffer_ptr += 4;
1468     }
1469
1470     // Handle 1 output pixel at a time.
1471     for (; outp < num_output_pixels; outp++)
1472     {
1473       // Load the accumulators from acc_buffer.
1474       int32x2_t acc = vld1_s32(acc_buffer_ptr);
1475       // Load the inputs, add input_offset.
1476       uint8x8_t input_u8 = vdup_n_u8(0);
1477       input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
1478       input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
1479       input_ptr += input_ptr_increment;
1480       const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
1481       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
1482
1483       // Multiply-accumulate.
1484       acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
1485       // Store the accumulators back to acc_buffer.
1486       vst1_s32(acc_buffer_ptr, acc);
1487       acc_buffer_ptr += 2;
1488     }
1489   }
1490 };
1491
1492 template <> struct QuantizedDepthwiseConvKernel<true, 4, 1>
1493 {
1494   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1495                   const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
1496                   const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
1497   {
1498     (void)input_depth;
1499     (void)depth_multiplier;
1500     if (num_output_pixels <= 0)
1501     {
1502       return;
1503     }
1504
1505     // Load the filters, add filter_offset.
1506     uint8x8_t filter_u8 = vdup_n_u8(0);
1507     filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
1508     filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
1509     filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
1510     filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
1511     const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
1512     const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
1513
1514     int outp = 0;
1515
1516     // Handle one output pixel at a time until second to the last pixel. Second
1517     // to the last because we read eight input pixels while only processing
1518     // four.
1519     for (; outp < num_output_pixels - 1; outp++)
1520     {
1521       // Load the accumulators from acc_buffer
1522       int32x4_t acc;
1523       acc = vld1q_s32(acc_buffer_ptr);
1524
1525       // Load the inputs, add input_offset.
1526       uint8x8_t input_u8 = vld1_u8(input_ptr);
1527       input_ptr += input_ptr_increment;
1528       const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
1529       const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
1530       // Multiply-accumulate
1531       acc = vmlal_s16(acc, filter, input);
1532       // Store the accumulators back to acc_buffer
1533       vst1q_s32(acc_buffer_ptr, acc);
1534       acc_buffer_ptr += 4;
1535     }
1536
1537     // Handle the last output pixel.
1538     // Load the accumulators from acc_buffer
1539     int32x4_t acc;
1540     acc = vld1q_s32(acc_buffer_ptr);
1541
1542     // Load the inputs, add input_offset.
1543     uint8x8_t input_u8 = vdup_n_u8(0);
1544     input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
1545     input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
1546     input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
1547     input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
1548     const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
1549     const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
1550     // Multiply-accumulate
1551     acc = vmlal_s16(acc, filter, input);
1552     // Store the accumulators back to acc_buffer
1553     vst1q_s32(acc_buffer_ptr, acc);
1554   }
1555 };
1556
1557 template <> struct QuantizedDepthwiseConvKernel<false, 12, 1>
1558 {
1559   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1560                   const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
1561                   const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
1562   {
1563     (void)input_depth;
1564     (void)depth_multiplier;
1565     // Load the filters, add filter_offset.
1566     uint8x8_t filter_u8_0 = vld1_u8(filter_ptr);
1567     uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 4);
1568     int16x8_t filter_s16_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
1569     int16x8_t filter_s16_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
1570     filter_s16_0 = vaddq_s16(filter_s16_0, vdupq_n_s16(filter_offset));
1571     filter_s16_1 = vaddq_s16(filter_s16_1, vdupq_n_s16(filter_offset));
1572     int16x4_t filter_0 = vget_low_s16(filter_s16_0);
1573     int16x4_t filter_1 = vget_high_s16(filter_s16_0);
1574     int16x4_t filter_2 = vget_high_s16(filter_s16_1);
1575
1576     // Handle one output pixel at a time.
1577     for (int outp = 0; outp < num_output_pixels; outp++)
1578     {
1579       // Load the inputs, add input_offset.
1580       uint8x8_t input_u8_0 = vld1_u8(input_ptr);
1581       uint8x8_t input_u8_1 = vld1_u8(input_ptr + 4);
1582       input_ptr += input_ptr_increment;
1583       int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
1584       int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
1585       input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
1586       input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
1587
1588       // Load the accumulators from acc_buffer
1589       int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
1590       int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
1591       int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
1592
1593       // Multiply-accumulate
1594       acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), filter_0);
1595       acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), filter_1);
1596       acc_2 = vmlal_s16(acc_2, vget_high_s16(input_1), filter_2);
1597
1598       // Store the accumulators back to acc_buffer
1599       vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
1600       vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
1601       vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
1602
1603       acc_buffer_ptr += 12;
1604     }
1605   }
1606 };
1607 #endif
1608
1609 // Accumulates the effect of one row of the filter, on a segment of one row
1610 // of the output, accessing the corresponding one row of the input.
1611 template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
1612 void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, int input_depth,
1613                                     int input_width, const uint8_t *input_data,
1614                                     int16_t input_offset, int pad_width, int depth_multiplier,
1615                                     int filter_width, const uint8_t *filter_data,
1616                                     int16_t filter_offset, int out_x_buffer_start,
1617                                     int out_x_buffer_end, int output_depth, int32_t *acc_buffer)
1618 {
1619   // Sanity check parameters. This is important in particular to ensure
1620   // that we keep the number of template instantiations minimal, so we don't
1621   // increase binary size unnecessarily.
1622   static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
1623   static_assert(kFixedInputDepth || kAllowStrided, "");
1624   assert(stride == 1 || kAllowStrided);
1625   if (kFixedInputDepth)
1626   {
1627     assert(input_depth == kFixedInputDepth);
1628   }
1629   if (kFixedDepthMultiplier)
1630   {
1631     assert(depth_multiplier == kFixedDepthMultiplier);
1632   }
1633   assert(output_depth == input_depth * depth_multiplier);
1634   const int input_ptr_increment = stride * input_depth;
1635   const uint8_t *filter_base_ptr = filter_data;
1636   for (int filter_x = 0; filter_x < filter_width; ++filter_x)
1637   {
1638     // For the current (filter_x, filter_y) point in the filter,
1639     // compute the boundaries of the corresponding output row segment.
1640     int out_x_loop_start_unclampled = 0;
1641     int out_x_loop_end_unclampled = 0;
1642     if (kAllowStrided)
1643     {
1644       if (stride == 2)
1645       {
1646         out_x_loop_start_unclampled = (pad_width - dilation_factor * filter_x + 1) / 2;
1647         out_x_loop_end_unclampled = (pad_width + input_width - dilation_factor * filter_x + 1) / 2;
1648       }
1649       else if (stride == 4)
1650       {
1651         out_x_loop_start_unclampled = (pad_width - dilation_factor * filter_x + 3) / 4;
1652         out_x_loop_end_unclampled = (pad_width + input_width - dilation_factor * filter_x + 3) / 4;
1653       }
1654       else
1655       {
1656         out_x_loop_start_unclampled =
1657           (pad_width - dilation_factor * filter_x + stride - 1) / stride;
1658         out_x_loop_end_unclampled =
1659           (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride;
1660       }
1661     }
1662     else
1663     {
1664       out_x_loop_start_unclampled = pad_width - dilation_factor * filter_x;
1665       out_x_loop_end_unclampled = pad_width + input_width - dilation_factor * filter_x;
1666     }
1667     // The kernel will have to iterate on the segment of the
1668     // output row that starts at out_x_loop_start and out_x_loop_end.
1669     const int out_x_loop_start = std::max(out_x_buffer_start, out_x_loop_start_unclampled);
1670     const int out_x_loop_end = std::min(out_x_buffer_end, out_x_loop_end_unclampled);
1671
1672     int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
1673     const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
1674     const uint8_t *input_ptr = input_data + in_x_origin * input_depth;
1675     const int num_output_pixels = out_x_loop_end - out_x_loop_start;
1676     QuantizedDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run(
1677       num_output_pixels, input_depth, depth_multiplier, input_ptr, input_offset,
1678       input_ptr_increment, filter_base_ptr, filter_offset, acc_buffer_ptr);
1679     filter_base_ptr += output_depth;
1680   }
1681 }
1682
1683 // generic fallback of DepthwiseConvAccumRow, portable, non-templatized.
1684 inline void QuantizedDepthwiseConvAccumRowGeneric(int stride, int dilation_factor, int input_depth,
1685                                                   int input_width, const uint8_t *input_data,
1686                                                   int16_t input_offset, int pad_width,
1687                                                   int depth_multiplier, int filter_width,
1688                                                   const uint8_t *filter_data, int16_t filter_offset,
1689                                                   int out_x_buffer_start, int out_x_buffer_end,
1690                                                   int output_depth, int32_t *acc_buffer)
1691 {
1692   const uint8_t *filter_base_ptr = filter_data;
1693   for (int filter_x = 0; filter_x < filter_width; ++filter_x)
1694   {
1695     const int out_x_loop_start =
1696       std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride);
1697     const int out_x_loop_end =
1698       std::min(out_x_buffer_end,
1699                (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride);
1700
1701     int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
1702     const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
1703     const uint8_t *input_ptr = input_data + in_x_origin * input_depth;
1704     const int input_ptr_increment = (stride - 1) * input_depth;
1705     for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++)
1706     {
1707       const uint8_t *filter_ptr = filter_base_ptr;
1708       for (int ic = 0; ic < input_depth; ++ic)
1709       {
1710         const int16_t input_val = *input_ptr++ + input_offset;
1711         for (int m = 0; m < depth_multiplier; m++)
1712         {
1713           const int16_t filter_val = *filter_ptr++ + filter_offset;
1714           *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
1715         }
1716       }
1717       input_ptr += input_ptr_increment;
1718     }
1719     filter_base_ptr += output_depth;
1720   }
1721 }
1722
1723 // Initializes the accumulator buffer with bias values.
1724 inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
1725                                        const int32_t *bias_data, int32_t *acc_buffer)
1726 {
1727   int i = 0;
1728 #ifdef USE_NEON
1729   if (output_depth == 1)
1730   {
1731     const int32x4_t b = vdupq_n_s32(bias_data[0]);
1732     for (; i <= num_output_pixels - 16; i += 16)
1733     {
1734       vst1q_s32(acc_buffer + i + 0, b);
1735       vst1q_s32(acc_buffer + i + 4, b);
1736       vst1q_s32(acc_buffer + i + 8, b);
1737       vst1q_s32(acc_buffer + i + 12, b);
1738     }
1739     for (; i <= num_output_pixels - 4; i += 4)
1740     {
1741       vst1q_s32(acc_buffer + i, b);
1742     }
1743   }
1744   else if (output_depth == 2)
1745   {
1746     int32x4_t b = vdupq_n_s32(bias_data[0]);
1747     b = vsetq_lane_s32(bias_data[1], b, 1);
1748     b = vsetq_lane_s32(bias_data[1], b, 3);
1749     for (; i <= num_output_pixels - 8; i += 8)
1750     {
1751       vst1q_s32(acc_buffer + 2 * i + 0, b);
1752       vst1q_s32(acc_buffer + 2 * i + 4, b);
1753       vst1q_s32(acc_buffer + 2 * i + 8, b);
1754       vst1q_s32(acc_buffer + 2 * i + 12, b);
1755     }
1756     for (; i <= num_output_pixels - 2; i += 2)
1757     {
1758       vst1q_s32(acc_buffer + 2 * i, b);
1759     }
1760   }
1761   else if (output_depth == 4)
1762   {
1763     const int32x4_t b = vld1q_s32(bias_data);
1764     for (; i <= num_output_pixels - 4; i += 4)
1765     {
1766       vst1q_s32(acc_buffer + 4 * i + 0, b);
1767       vst1q_s32(acc_buffer + 4 * i + 4, b);
1768       vst1q_s32(acc_buffer + 4 * i + 8, b);
1769       vst1q_s32(acc_buffer + 4 * i + 12, b);
1770     }
1771     for (; i < num_output_pixels; i++)
1772     {
1773       vst1q_s32(acc_buffer + 4 * i, b);
1774     }
1775   }
1776   else if (output_depth == 8)
1777   {
1778     const int32x4_t b0 = vld1q_s32(bias_data);
1779     const int32x4_t b1 = vld1q_s32(bias_data + 4);
1780     for (; i <= num_output_pixels - 2; i += 2)
1781     {
1782       vst1q_s32(acc_buffer + 8 * i + 0, b0);
1783       vst1q_s32(acc_buffer + 8 * i + 4, b1);
1784       vst1q_s32(acc_buffer + 8 * i + 8, b0);
1785       vst1q_s32(acc_buffer + 8 * i + 12, b1);
1786     }
1787     for (; i < num_output_pixels; i++)
1788     {
1789       vst1q_s32(acc_buffer + 8 * i + 0, b0);
1790       vst1q_s32(acc_buffer + 8 * i + 4, b1);
1791     }
1792   }
1793   else if (output_depth == 16)
1794   {
1795     const int32x4_t b0 = vld1q_s32(bias_data);
1796     const int32x4_t b1 = vld1q_s32(bias_data + 4);
1797     const int32x4_t b2 = vld1q_s32(bias_data + 8);
1798     const int32x4_t b3 = vld1q_s32(bias_data + 12);
1799     for (; i < num_output_pixels; i++)
1800     {
1801       vst1q_s32(acc_buffer + 16 * i + 0, b0);
1802       vst1q_s32(acc_buffer + 16 * i + 4, b1);
1803       vst1q_s32(acc_buffer + 16 * i + 8, b2);
1804       vst1q_s32(acc_buffer + 16 * i + 12, b3);
1805     }
1806   }
1807 #endif
1808   for (; i < num_output_pixels; i++)
1809   {
1810     memcpy(acc_buffer + i * output_depth, bias_data, sizeof(acc_buffer[0]) * output_depth);
1811   }
1812 }
1813
1814 inline void DepthwiseConvGeneral(const DepthwiseConvParams &params, const Shape &input_shape,
1815                                  const uint8_t *input_data, const Shape &filter_shape,
1816                                  const uint8_t *filter_data, const Shape &bias_shape,
1817                                  const int32_t *bias_data, const Shape &output_shape,
1818                                  uint8_t *output_data, int thread_start, int thread_end,
1819                                  int thread_dim)
1820 {
1821   (void)bias_shape;
1822   const int stride_width = params.stride_width;
1823   const int stride_height = params.stride_height;
1824   const int pad_width = params.padding_values.width;
1825   const int pad_height = params.padding_values.height;
1826   const int depth_multiplier = params.depth_multiplier;
1827   const int32_t output_activation_min = params.quantized_activation_min;
1828   const int32_t output_activation_max = params.quantized_activation_max;
1829   const int32_t input_offset = params.input_offset;
1830   const int32_t filter_offset = params.weights_offset;
1831   const int32_t output_offset = params.output_offset;
1832   const int32_t output_multiplier = params.output_multiplier;
1833   const int output_shift = params.output_shift;
1834   const int dilation_width_factor = params.dilation_width_factor;
1835   const int dilation_height_factor = params.dilation_height_factor;
1836   const int batches = MatchingDim(input_shape, 0, output_shape, 0);
1837   const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
1838   const int input_height = input_shape.Dims(1);
1839   const int input_width = input_shape.Dims(2);
1840   const int input_depth = input_shape.Dims(3);
1841   const int filter_height = filter_shape.Dims(1);
1842   const int filter_width = filter_shape.Dims(2);
1843   const int output_height = output_shape.Dims(1);
1844   const int output_width = output_shape.Dims(2);
1845 #ifdef USE_NEON
1846   const bool shift_left = (output_shift > 0);
1847   const int32_t multiplier_power_of_two = shift_left ? (1 << output_shift) : 1;
1848 #endif
1849
1850   static const int kAccBufferMaxSize = 2048;
1851   int32_t acc_buffer[kAccBufferMaxSize];
1852   assert(kAccBufferMaxSize >= output_depth);
1853   const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
1854   const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
1855   assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize);
1856   assert(kAccBufferActualSize <= kAccBufferMaxSize);
1857   assert(kOutputPixelsInAccBuffer >= 1);
1858   assert(thread_dim == 0 || thread_dim == 1);
1859
1860   UNUSED_RELEASE(kAccBufferActualSize);
1861
1862   // row_accum_func will point to the core accumulation function to be used
1863   // for this DepthwiseConv op.
1864   using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric);
1865   row_accum_func_t row_accum_func = nullptr;
1866
1867 #define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \
1868   if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&                                  \
1869       (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&                             \
1870       depth_multiplier == FIXED_DEPTH_MULTIPLIER)                                                 \
1871   {                                                                                               \
1872     row_accum_func =                                                                              \
1873       QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>;   \
1874   }
1875
1876 #ifdef USE_NEON
1877   // We go over our list of kernels by decreasing order of preference
1878   // for the cases where multiple kernels could apply.
1879
1880   // Start with the fastest kernels: AllowStrided=false, fixed input depth.
1881
1882   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2)
1883   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2)
1884   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2)
1885   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4)
1886   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1)
1887   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4)
1888   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
1889   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8)
1890   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
1891   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 12, 1)
1892
1893   // Next come the strided kernels: AllowStrided=true, fixed input depth.
1894   // They are a bit less efficient, but allow stride!=1.
1895
1896   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2)
1897   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1)
1898   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16)
1899   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
1900   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
1901   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
1902   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
1903   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
1904   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)
1905
1906   // Finally, the kernels allowing a variable input depth,
1907   // these are the least efficient but most general kernels.
1908
1909   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
1910   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
1911   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3)
1912 #endif // USE_NEON
1913
1914   // No matching fast kernel found, use slow fallback.
1915   if (!row_accum_func)
1916   {
1917     row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;
1918   }
1919
1920 #undef TFMINI_USE_DEPTHWISECONV_KERNEL
1921
1922   const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
1923   const int input_batch_stride = input_height_stride * input_shape.Dims(1);
1924   const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);
1925
1926   // Now that we have determined row_accum_func, we can start work.
1927   int batch_start = 0;
1928   int batch_end = batches;
1929   int row_start = 0;
1930   int row_end = output_height;
1931   int output_ptr_offset = 0;
1932
1933   switch (thread_dim)
1934   {
1935     case 0:
1936       // Multithread along with the batch axis
1937       assert(thread_start >= 0);
1938       assert(thread_end <= batches);
1939       batch_start = thread_start;
1940       batch_end = thread_end;
1941       output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0);
1942       break;
1943     case 1:
1944       // Multithread along with the row axis
1945       assert(thread_start >= 0);
1946       assert(thread_end <= output_height);
1947       row_start = thread_start;
1948       row_end = thread_end;
1949       output_ptr_offset = row_start * output_width * output_depth;
1950       break;
1951   }
1952
1953   uint8_t *output_ptr = output_data + output_ptr_offset;
1954   int batch_step = (output_height + row_start - row_end) * output_width * output_depth;
1955   for (int b = batch_start; b < batch_end; ++b)
1956   {
1957     for (int out_y = row_start; out_y < row_end; ++out_y)
1958     {
1959       const int in_y_origin = (out_y * stride_height) - pad_height;
1960       const int filter_y_start =
1961         std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor);
1962       const int filter_y_end =
1963         std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) /
1964                                   dilation_height_factor);
1965       for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
1966            out_x_buffer_start += kOutputPixelsInAccBuffer)
1967       {
1968         const int out_x_buffer_end =
1969           std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
1970         // We call a 'pixel' a group of activation that share all but the
1971         // 'depth'/'channel' coordinate. num_output_pixels is the number of
1972         // output pixels that we will accumulate in this loop iteration.
1973         const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
1974         // Initialize our local accumulator with the bias values, so we don't
1975         // have to add them later.
1976         DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer);
1977         // Accumulation loop. Most of the time should be spent in here.
1978         for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
1979         {
1980           const int in_y = in_y_origin + dilation_height_factor * filter_y;
1981           row_accum_func(stride_width, dilation_width_factor, input_depth, input_width,
1982                          input_data + in_y * input_height_stride + b * input_batch_stride,
1983                          input_offset, pad_width, depth_multiplier, filter_width,
1984                          filter_data + filter_y * filter_height_stride, filter_offset,
1985                          out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer);
1986         }
1987         // Finished accumulating int32_t values. Now need to convert them to
1988         // the final 8bit form and store them.
1989         const int num_output_values = output_depth * num_output_pixels;
1990         int i = 0;
1991 #ifdef USE_NEON
1992         using gemmlowp::RoundingDivideByPOT;
1993         const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
1994         const int32x4_t output_activation_min_vec = vdupq_n_s32(output_activation_min);
1995         const int32x4_t output_activation_max_vec = vdupq_n_s32(output_activation_max);
1996         // Handle 16 values at once.
1997         // This allows us to issue 4 mutually independent int32
1998         // multiplications (vqrdmulh), which should alleviate most of their
1999         // high latency.
2000         for (; i <= num_output_values - 16; i += 16)
2001         {
2002           int32x4_t acc[4];
2003           for (int j = 0; j < 4; j++)
2004           {
2005             acc[j] = vld1q_s32(acc_buffer + i + 4 * j);
2006           }
2007
2008           if (!shift_left)
2009           {
2010             // Fixed-point multiplication.
2011             for (int j = 0; j < 4; j++)
2012             {
2013               acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
2014             }
2015             for (int j = 0; j < 4; j++)
2016             {
2017               acc[j] = RoundingDivideByPOT(acc[j], -output_shift);
2018             }
2019           }
2020           else
2021           {
2022             // Fixed-point multiplication.
2023             for (int j = 0; j < 4; j++)
2024             {
2025               acc[j] = vmulq_n_s32(acc[j], multiplier_power_of_two);
2026               acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
2027             }
2028           }
2029           // Add the output offset.
2030           for (int j = 0; j < 4; j++)
2031           {
2032             acc[j] = vaddq_s32(acc[j], output_offset_vec);
2033           }
2034           // Apply the activation function.
2035           for (int j = 0; j < 4; j++)
2036           {
2037             acc[j] = vmaxq_s32(acc[j], output_activation_min_vec);
2038           }
2039           for (int j = 0; j < 4; j++)
2040           {
2041             acc[j] = vminq_s32(acc[j], output_activation_max_vec);
2042           }
2043           // Saturating cast to uint8_t and store to destination.
2044           int16x4_t acc_s16[4];
2045           for (int j = 0; j < 4; j++)
2046           {
2047             acc_s16[j] = vqmovn_s32(acc[j]);
2048           }
2049           const int16x8_t res_s16_0 = vcombine_s16(acc_s16[0], acc_s16[1]);
2050           const int16x8_t res_s16_1 = vcombine_s16(acc_s16[2], acc_s16[3]);
2051           const uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0);
2052           const uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1);
2053           vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1));
2054           output_ptr += 16;
2055         }
2056         // Handle 8 values at once.
2057         // Not as good as 16 (now we're only issuing 2 mutually independent
2058         // vqrdmulh instructions, so we're probably paying for their high
2059         // latency).
2060         for (; i <= num_output_values - 8; i += 8)
2061         {
2062           int32x4_t acc0 = vld1q_s32(acc_buffer + i);
2063           int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4);
2064           if (!shift_left)
2065           {
2066             // Fixed-point multiplication.
2067             acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
2068             acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
2069             // Rounding right shift.
2070             acc0 = RoundingDivideByPOT(acc0, -output_shift);
2071             acc1 = RoundingDivideByPOT(acc1, -output_shift);
2072           }
2073           else
2074           {
2075             // Fixed-point multiplication.
2076             acc0 = vmulq_n_s32(acc0, multiplier_power_of_two);
2077             acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
2078
2079             acc1 = vmulq_n_s32(acc1, multiplier_power_of_two);
2080             acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
2081           }
2082           // Add the output offset.
2083           acc0 = vaddq_s32(acc0, output_offset_vec);
2084           acc1 = vaddq_s32(acc1, output_offset_vec);
2085           // Apply the activation function.
2086           acc0 = vmaxq_s32(acc0, output_activation_min_vec);
2087           acc1 = vmaxq_s32(acc1, output_activation_min_vec);
2088           acc0 = vminq_s32(acc0, output_activation_max_vec);
2089           acc1 = vminq_s32(acc1, output_activation_max_vec);
2090           // Saturating cast to uint8_t and store to destination.
2091           const int16x4_t acc0_s16 = vqmovn_s32(acc0);
2092           const int16x4_t acc1_s16 = vqmovn_s32(acc1);
2093           const int16x8_t res_s16 = vcombine_s16(acc0_s16, acc1_s16);
2094           const uint8x8_t res_u8 = vqmovun_s16(res_s16);
2095           vst1_u8(output_ptr, res_u8);
2096           output_ptr += 8;
2097         }
2098         // Handle 4 values at once. Now we're paying the full price of the
2099         // high latency of vqrdmulh. Also, storing only 4 bytes at the end
2100         // (without any alignment) can only be done 1 byte at a time.
2101         // Yet, that is still worth doing to minimize the amount of leftover
2102         // that will have to go through the very slow scalar code.
2103         for (; i <= num_output_values - 4; i += 4)
2104         {
2105           int32x4_t acc = vld1q_s32(acc_buffer + i);
2106           if (!shift_left)
2107           {
2108             // Fixed-point multiplication.
2109             acc = vqrdmulhq_n_s32(acc, output_multiplier);
2110             // Rounding right shift.
2111             acc = RoundingDivideByPOT(acc, -output_shift);
2112           }
2113           else
2114           {
2115             // Fixed-point multiplication.
2116             acc = vmulq_n_s32(acc, multiplier_power_of_two);
2117             acc = vqrdmulhq_n_s32(acc, output_multiplier);
2118           }
2119           // Add the output offset.
2120           acc = vaddq_s32(acc, output_offset_vec);
2121           // Apply the activation function.
2122           acc = vmaxq_s32(acc, output_activation_min_vec);
2123           acc = vminq_s32(acc, output_activation_max_vec);
2124           // Saturating cast to uint8_t and store to destination.
2125           const int16x4_t acc_s16 = vqmovn_s32(acc);
2126           const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16);
2127           const uint8x8_t res_u8 = vqmovun_s16(res_s16);
2128           vst1_lane_u8(output_ptr + 0, res_u8, 0);
2129           vst1_lane_u8(output_ptr + 1, res_u8, 1);
2130           vst1_lane_u8(output_ptr + 2, res_u8, 2);
2131           vst1_lane_u8(output_ptr + 3, res_u8, 3);
2132           output_ptr += 4;
2133         }
2134 #endif // USE_NEON
2135
2136         // Handle leftover values, one by one. This is very slow.
2137         for (; i < num_output_values; i++)
2138         {
2139           int32_t acc = acc_buffer[i];
2140           acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
2141           acc += output_offset;
2142           acc = std::max(acc, output_activation_min);
2143           acc = std::min(acc, output_activation_max);
2144           *output_ptr++ = static_cast<uint8_t>(acc);
2145         }
2146       }
2147     }
2148     output_ptr += batch_step;
2149   }
2150 }
2151
2152 } // namespace depthwise_conv
2153
2154 // template <DepthwiseConvOutputRounding kOutputRounding>
2155 inline void DepthwiseConvWithRounding(const DepthwiseConvParams &params, const Shape &input_shape,
2156                                       const uint8_t *input_data, const Shape &filter_shape,
2157                                       const uint8_t *filter_data, const Shape &bias_shape,
2158                                       const int32_t *bias_data, const Shape &output_shape,
2159                                       uint8_t *output_data, int thread_start, int thread_end,
2160                                       int thread_dim)
2161 {
2162   const int depth_multiplier = params.depth_multiplier;
2163   const int32_t output_activation_min = params.quantized_activation_min;
2164   const int32_t output_activation_max = params.quantized_activation_max;
2165   const int dilation_width_factor = params.dilation_width_factor;
2166   const int dilation_height_factor = params.dilation_height_factor;
2167   assert(dilation_width_factor >= 1);
2168   assert(dilation_height_factor >= 1);
2169   assert(input_shape.DimensionsCount() == 4);
2170   assert(filter_shape.DimensionsCount() == 4);
2171   assert(output_shape.DimensionsCount() == 4);
2172   assert(output_activation_min <= output_activation_max);
2173   const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
2174   const int input_depth = input_shape.Dims(3);
2175   assert(output_depth == input_depth * depth_multiplier);
2176   assert(bias_shape.FlatSize() == output_depth);
2177
2178   UNUSED_RELEASE(depth_multiplier);
2179   UNUSED_RELEASE(output_activation_min);
2180   UNUSED_RELEASE(output_activation_max);
2181   UNUSED_RELEASE(dilation_width_factor);
2182   UNUSED_RELEASE(dilation_height_factor);
2183   UNUSED_RELEASE(output_depth);
2184   UNUSED_RELEASE(input_depth);
2185
2186 // Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
2187 // Jetson TX-2. This compiler does not support the offsetof() macro.
2188 #if defined(__aarch64__) && !defined(GOOGLE_L4T)
2189 //  TODO Use below codes
2190 //  // Dispatch to dot-product 3x3 kernels when supported.
2191 //
2192 //  ruy::Context *ruy_context = cpu_backend_context->ruy_context();
2193 //  const bool has_dot_product_instructions =
2194 //      ruy_context != nullptr &&
2195 //      (ruy_context->GetRuntimeEnabledPaths() & ruy::Path::kNeonDotprod) != ruy::Path::kNone;
2196 //  if (has_dot_product_instructions)
2197 //  {
2198 //    using optimized_ops::depthwise_conv::DotProduct3x3KernelType;
2199 //    DotProduct3x3KernelType kernel_type =
2200 //    optimized_ops::depthwise_conv::CategorizeDotProductKernel(
2201 //        input_shape, filter_shape, params);
2202 //    if (kernel_type != DotProduct3x3KernelType::kNone)
2203 //    {
2204 //      optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3<
2205 //          DepthwiseConvImplementation::kUseNeon3x3DotProduct>(params, input_shape, input_data,
2206 //                                                              filter_shape, filter_data,
2207 //                                                              bias_shape,
2208 //                                                              bias_data, output_shape,
2209 //                                                              output_data);
2210 //      return;
2211 //    }
2212 //  }
2213 //
2214 //  // Dispatch to non-dot-product 3x3 kernels when supported.
2215 //
2216 //  const int stride_width = params.stride_width;
2217 //  const int stride_height = params.stride_height;
2218 //  const int pad_width = params.padding_values.width;
2219 //  const int pad_height = params.padding_values.height;
2220 //  const int output_shift = params.output_shift;
2221 //
2222 //  // Call kernel optimized for depthwise convolutions using 3x3 filters if
2223 //  // parameters are supported.
2224 //  if (depthwise_conv::Fast3x3FilterKernelSupported(input_shape, filter_shape, stride_width,
2225 //                                                   stride_height, dilation_width_factor,
2226 //                                                   dilation_height_factor, pad_width, pad_height,
2227 //                                                   depth_multiplier, output_shape, output_shift))
2228 //  {
2229 //    depthwise_conv::DepthwiseConv3x3Filter<kOutputRounding>(
2230 //        params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data,
2231 //        output_shape, output_data, thread_start, thread_end, thread_dim);
2232 //    return;
2233 //  }
2234 #endif
2235
2236   depthwise_conv::DepthwiseConvGeneral(params, input_shape, input_data, filter_shape, filter_data,
2237                                        bias_shape, bias_data, output_shape, output_data,
2238                                        thread_start, thread_end, thread_dim);
2239 }
2240
2241 inline void DepthwiseConvImpl(const DepthwiseConvParams &params, const Shape &input_shape,
2242                               const uint8_t *input_data, const Shape &filter_shape,
2243                               const uint8_t *filter_data, const Shape &bias_shape,
2244                               const int32_t *bias_data, const Shape &output_shape,
2245                               uint8_t *output_data, int thread_start, int thread_end,
2246                               int thread_dim)
2247 {
2248   return DepthwiseConvWithRounding(params, input_shape, input_data, filter_shape, filter_data,
2249                                    bias_shape, bias_data, output_shape, output_data, thread_start,
2250                                    thread_end, thread_dim);
2251 }
2252
2253 } // namespace optimized
2254 } // namespace cker
2255 } // namespace nnfw
2256
2257 #endif // __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_UINT8_H__