/*
 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_UINT8_H__
#define __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_UINT8_H__

#include "cker/Shape.h"
#include "cker/Types.h"
#include "cker/Utils.h"
#include "cker/neon/neon_check.h"

#include <fixedpoint/fixedpoint.h>
#include <public/gemmlowp.h>

#include <algorithm>
#include <cassert>

namespace nnfw
{
namespace cker
{
namespace optimized
{
// Implementation of quantized DepthwiseConv
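//
// Each QuantizedDepthwiseConvKernel<kAllowStrided, kFixedInputDepth,
// kFixedDepthMultiplier>::Run() specialization below accumulates, for
// num_output_pixels output pixels along one row, the products
// (input_val + input_offset) * (filter_val + filter_offset) into the int32
// accumulators at acc_buffer_ptr. Per output pixel the accumulators are laid
// out as [input channel][depth multiplier], i.e. input_depth * depth_multiplier
// values. The kAllowStrided specializations honor input_ptr_increment; most of
// the stride-1 kernels ignore it and advance input_ptr by their fixed input
// depth directly.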
template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
struct QuantizedDepthwiseConvKernel
{
};

#ifdef USE_NEON
template <> struct QuantizedDepthwiseConvKernel<true, 8, 2>
{
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
                  const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
  {
    (void)input_depth;
    (void)depth_multiplier;
    // Load the filters, add filter_offset.
    uint8x8x2_t filter_u8;
    filter_u8.val[0] = vld1_u8(filter_ptr);
    filter_u8.val[1] = vld1_u8(filter_ptr + 8);
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++)
    {
      filter[i] =
        vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])), vdupq_n_s16(filter_offset));
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++)
    {
      // Load the accumulators from acc_buffer
      int32x4x2_t acc[2];
      for (int i = 0; i < 2; i++)
      {
        acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
        acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
      }
      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += input_ptr_increment;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
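      // (For example, input lanes [a b c d e f g h] zip into
      //  val[0] = [a a b b c c d d] and val[1] = [e e f f g g h h],
      //  lining each input channel up with its two output channels.)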
      // Multiply-accumulate
      for (int i = 0; i < 2; i++)
      {
        acc[0].val[i] =
          vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), vget_low_s16(input_dup2.val[i]));
        acc[1].val[i] =
          vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), vget_high_s16(input_dup2.val[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++)
      {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
        vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
      }
      acc_buffer_ptr += 16;
    }
  }
};
template <> struct QuantizedDepthwiseConvKernel<false, 8, 1>
{
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
                  const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
  {
    (void)input_depth;
    (void)depth_multiplier;
    (void)input_ptr_increment;
    // Load the filters, add filter_offset.
    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));

    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2)
    {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++)
      {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8[2];
      for (int i = 0; i < 2; i++)
      {
        input_u8[i] = vld1_u8(input_ptr + 8 * i);
      }
      input_ptr += 16;
      int16x8_t input[2];
      for (int i = 0; i < 2; i++)
      {
        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
      }
      for (int i = 0; i < 2; i++)
      {
        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
      }
      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0]));
      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0]));
      acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1]));
      acc[3] = vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1]));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++)
      {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle 1 output pixel at a time.
    for (; outp < num_output_pixels; outp++)
    {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[2];
      acc[0] = vld1q_s32(acc_buffer_ptr);
      acc[1] = vld1q_s32(acc_buffer_ptr + 4);

      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input));
      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input));
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc[0]);
      vst1q_s32(acc_buffer_ptr + 4, acc[1]);
      acc_buffer_ptr += 8;
    }
  }
};
template <> struct QuantizedDepthwiseConvKernel<false, 4, 2>
{
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
                  const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
  {
    (void)input_depth;
    (void)depth_multiplier;
    (void)input_ptr_increment;
    // Load the filters, add filter_offset.
    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));

    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2)
    {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++)
      {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      for (int i = 0; i < 2; i++)
      {
        acc[2 * i + 0] =
          vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), vget_low_s16(input_dup2.val[i]));
        acc[2 * i + 1] =
          vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), vget_high_s16(input_dup2.val[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++)
      {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++)
    {
      // Load the accumulators from acc_buffer
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++)
      {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x4x2_t input_dup2 = vzip_s16(input, input);
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]);
      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++)
      {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
  }
};
template <> struct QuantizedDepthwiseConvKernel<false, 2, 8>
{
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
                  const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
  {
    (void)input_depth;
    (void)depth_multiplier;
    (void)input_ptr_increment;
    // Load the filters, add filter_offset.
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++)
    {
      const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
      const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
      filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
    }
    int outp = 0;
    // Handle two output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2)
    {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[8];
      for (int i = 0; i < 8; i++)
      {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Multiply-accumulate.
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
      acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2);
      acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2);
      acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3);
      acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3);
      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 8; i++)
      {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 32;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++)
    {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++)
      {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_ptr += 2;
      const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);

      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 4; i++)
      {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};
template <> struct QuantizedDepthwiseConvKernel<false, 2, 2>
{
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
                  const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
  {
    (void)input_depth;
    (void)depth_multiplier;
    (void)input_ptr_increment;
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
    const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4)
    {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++)
      {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++)
      {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++)
    {
      // Load the accumulators from acc_buffer
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_ptr += 2;
      const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x4_t input_dup2 = vzip_s16(input, input).val[0];
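      // (vzip_s16(input, input).val[0] is [i0 i0 i1 i1]; the low half alone
      //  covers this pixel's 2 input channels x 2 depth multipliers.)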
      // Multiply-accumulate
      acc = vmlal_s16(acc, filter, input_dup2);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
  }
};
template <> struct QuantizedDepthwiseConvKernel<false, 2, 1>
{
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
                  const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
  {
    (void)input_depth;
    (void)depth_multiplier;
    (void)input_ptr_increment;
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
    const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8)
    {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++)
      {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      uint8x8_t input_u8[2];
      for (int i = 0; i < 2; i++)
      {
        input_u8[i] = vld1_u8(input_ptr + 8 * i);
      }
      input_ptr += 16;
      int16x8_t input[2];
      for (int i = 0; i < 2; i++)
      {
        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
      }
      for (int i = 0; i < 2; i++)
      {
        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
      }

      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0]));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0]));
      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1]));
      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1]));
      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 4; i++)
      {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4)
    {
      // Load the accumulators from acc_buffer.
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++)
      {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Multiply-accumulate.
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input));
      // Store the accumulators back to acc_buffer.
      for (int i = 0; i < 2; i++)
      {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2)
    {
      // Load the accumulators from acc_buffer.
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc = vmlal_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer.
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
    // Handle 1 output pixel at a time.
    for (; outp < num_output_pixels; outp++)
    {
      // Load the accumulators from acc_buffer.
      int32x2_t acc = vld1_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_ptr += 2;
      const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
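      // (vmlal_s16 only writes a full 128-bit accumulator, so the 2-lane
      //  accumulator is duplicated into a q register for the MLA and the low
      //  half of the result kept.)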
      // Store the accumulators back to acc_buffer.
      vst1_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};
template <> struct QuantizedDepthwiseConvKernel<false, 1, 2>
{
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
                  const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
  {
    (void)input_depth;
    (void)depth_multiplier;
    (void)input_ptr_increment;
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
    const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8)
    {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++)
      {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Duplicate the input values, 2-fold
      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++)
      {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++)
    {
      // Load the accumulators from acc_buffer
      int32x2_t acc = vld1_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      const uint32_t input = *input_ptr++ + input_offset;

      // Multiply-accumulate
      acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input));
      // Store the accumulators back to acc_buffer
      vst1_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};
template <> struct QuantizedDepthwiseConvKernel<false, 1, 4>
{
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
                  const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
  {
    (void)input_depth;
    (void)depth_multiplier;
    (void)input_ptr_increment;
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
    const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8)
    {
      // Load the accumulators from acc_buffer
      int32x4_t acc[8];
      for (int i = 0; i < 8; i++)
      {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0);
      acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1);
      acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2);
      acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3);
      acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0);
      acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1);
      acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2);
      acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3);

      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 8; i++)
      {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 32;
    }
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4)
    {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++)
      {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], filter, input, 0);
      acc[1] = vmlal_lane_s16(acc[1], filter, input, 1);
      acc[2] = vmlal_lane_s16(acc[2], filter, input, 2);
      acc[3] = vmlal_lane_s16(acc[3], filter, input, 3);

      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++)
      {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++)
    {
      // Load the accumulators from acc_buffer
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      const uint32_t input = *input_ptr++ + input_offset;

      // Multiply-accumulate
      acc = vmlal_n_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
  }
};
template <> struct QuantizedDepthwiseConvKernel<false, 4, 1>
{
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
                  const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
  {
    (void)input_depth;
    (void)depth_multiplier;
    (void)input_ptr_increment;
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
    const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4)
    {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++)
      {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Load the inputs, add input_offset.
      int16x8_t input[2];
      for (int i = 0; i < 2; i++)
      {
        const uint8x8_t input_u8 = vld1_u8(input_ptr + 8 * i);
        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
        input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      }
      input_ptr += 16;
      // Multiply-accumulate
      for (int i = 0; i < 2; i++)
      {
        acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i]));
        acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++)
      {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++)
    {
      // Load the accumulators from acc_buffer
      int32x4_t acc;
      acc = vld1q_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Multiply-accumulate
      acc = vmlal_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
  }
};
template <> struct QuantizedDepthwiseConvKernel<false, 4, 4>
{
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
                  const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
  {
    (void)input_depth;
    (void)depth_multiplier;
    (void)input_ptr_increment;
    // Load the filters, add filter_offset.
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++)
    {
      const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
      const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
      filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
    }

    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2)
    {
      // Load the accumulators from acc_buffer
      int32x4_t acc[8];
      for (int i = 0; i < 8; i++)
      {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += 8;
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), vget_low_s16(input), 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), vget_low_s16(input), 1);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), vget_low_s16(input), 2);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), vget_low_s16(input), 3);
      acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), vget_high_s16(input), 0);
      acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), vget_high_s16(input), 1);
      acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), vget_high_s16(input), 2);
      acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), vget_high_s16(input), 3);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 8; i++)
      {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 32;
    }
    // Handle one output pixel at a time.
    for (; outp < num_output_pixels; outp++)
    {
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++)
      {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
      input_ptr += 4;
      const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate
      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1);
      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2);
      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++)
      {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};
template <> struct QuantizedDepthwiseConvKernel<true, 0, 3>
{
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
                  const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
  {
    (void)depth_multiplier;
    // We will have to duplicate bytes in a NEON register, 3-fold.
    // We will do that by register-level table-look-up using VTBL instructions.
    // Here we prepare the registers containing the table-lookup indices.
    static const uint8_t dup3_indices_array[3][8] = {
      {0, 0, 0, 1, 1, 1, 2, 2}, {2, 3, 3, 3, 4, 4, 4, 5}, {5, 5, 6, 6, 6, 7, 7, 7}};
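    // With input bytes [b0 .. b7], the three lookups below produce
    // [b0 b0 b0 b1 b1 b1 b2 b2], [b2 b3 b3 b3 b4 b4 b4 b5] and
    // [b5 b5 b6 b6 b6 b7 b7 b7]: every input channel repeated 3-fold across
    // 24 output bytes.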
    uint8x8_t dup3_indices[3];
    for (int i = 0; i < 3; i++)
    {
      dup3_indices[i] = vld1_u8(dup3_indices_array[i]);
    }

    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++)
    {
      const uint8_t *local_filter_ptr = filter_ptr;
      const uint8_t *local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 8 input channels at a time.
      for (; ic <= input_depth - 8; ic += 8)
      {
        // Load the filters, add filter_offset.
        int16x8_t filter[3];
        uint8x8x3_t filter_u8;
        filter_u8.val[0] = vld1_u8(local_filter_ptr);
        filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
        filter_u8.val[2] = vld1_u8(local_filter_ptr + 16);
        local_filter_ptr += 24;
        for (int i = 0; i < 3; i++)
        {
          const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
          filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
        }
        // Load the inputs, duplicate 3-fold, add input_offset.
        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
        local_input_ptr += 8;

        uint8x8_t input_u8_dup3[3];
        for (int i = 0; i < 3; i++)
        {
          input_u8_dup3[i] = vtbl1_u8(input_u8, dup3_indices[i]);
        }
        int16x8_t input_dup3[3];
        for (int i = 0; i < 3; i++)
        {
          const int16x8_t input_s16_dup3 = vreinterpretq_s16_u16(vmovl_u8(input_u8_dup3[i]));
          input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset));
        }
        // Load the accumulators from acc_buffer
        int32x4x3_t acc[2];
        for (int i = 0; i < 2; i++)
        {
          acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
          acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
          acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16);
        }
        // Multiply-accumulate
        for (int j = 0; j < 3; j++)
        {
          acc[0].val[j] =
            vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), vget_low_s16(filter[j]));
          acc[1].val[j] =
            vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), vget_high_s16(filter[j]));
        }
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 2; i++)
        {
          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
          vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
          vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]);
        }
        acc_buffer_ptr += 24;
      }
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++)
      {
        const uint16_t input_val = *local_input_ptr++ + input_offset;
        for (int i = 0; i < 3; i++)
        {
          const uint16_t filter_val = local_filter_ptr[i] + filter_offset;
          *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
        }
        local_filter_ptr += 3;
      }
      input_ptr += input_ptr_increment;
    }
  }
};
template <> struct QuantizedDepthwiseConvKernel<true, 0, 2>
{
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
                  const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
  {
    (void)depth_multiplier;
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++)
    {
      const uint8_t *local_filter_ptr = filter_ptr;
      const uint8_t *local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 8 input channels at a time.
      for (; ic <= input_depth - 8; ic += 8)
      {
        // Load the filters, add filter_offset.
        int16x8_t filter[2];
        uint8x8x2_t filter_u8;
        filter_u8.val[0] = vld1_u8(local_filter_ptr);
        filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
        local_filter_ptr += 16;
        for (int i = 0; i < 2; i++)
        {
          const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
          filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
        }
        // Load the inputs, add input_offset, duplicate 2-fold.
        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
        local_input_ptr += 8;
        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
        const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
        const int16x8x2_t input_dup2 = vzipq_s16(input, input);
        // Load the accumulators from acc_buffer.
        int32x4x2_t acc[2];
        for (int i = 0; i < 2; i++)
        {
          acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
          acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
        }
        // Multiply-accumulate.
        for (int j = 0; j < 2; j++)
        {
          acc[0].val[j] =
            vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), vget_low_s16(input_dup2.val[j]));
          acc[1].val[j] =
            vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), vget_high_s16(input_dup2.val[j]));
        }
        // Store the accumulators back to acc_buffer.
        for (int i = 0; i < 2; i++)
        {
          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
          vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
        }
        acc_buffer_ptr += 16;
      }
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++)
      {
        // Load the inputs.
        const uint16_t input_val = *local_input_ptr++ + input_offset;
        for (int i = 0; i < 2; i++)
        {
          const uint16_t filter_val = local_filter_ptr[i] + filter_offset;
          *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
        }
        local_filter_ptr += 2;
      }
      input_ptr += input_ptr_increment;
    }
  }
};
template <> struct QuantizedDepthwiseConvKernel<true, 0, 1>
{
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
                  const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
  {
    (void)depth_multiplier;
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++)
    {
      const uint8_t *local_filter_ptr = filter_ptr;
      const uint8_t *local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 16 input channels at a time.
      for (; ic <= input_depth - 16; ic += 16)
      {
        // Load the filters, add filter_offset.
        uint8x8_t filter_u8_0 = vld1_u8(local_filter_ptr + 8 * 0);
        uint8x8_t filter_u8_1 = vld1_u8(local_filter_ptr + 8 * 1);
        local_filter_ptr += 16;
        int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
        int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
        filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
        filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
        // Load the inputs, add input_offset.
        uint8x8_t input_u8_0 = vld1_u8(local_input_ptr + 8 * 0);
        uint8x8_t input_u8_1 = vld1_u8(local_input_ptr + 8 * 1);
        local_input_ptr += 16;
        int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
        int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
        input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
        input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
        // Load the accumulators from acc_buffer
        int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
        int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
        int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
        int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
        acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), vget_low_s16(filter_0));
        acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), vget_high_s16(filter_0));
        acc_2 = vmlal_s16(acc_2, vget_low_s16(input_1), vget_low_s16(filter_1));
        acc_3 = vmlal_s16(acc_3, vget_high_s16(input_1), vget_high_s16(filter_1));
        // Store the accumulators back to acc_buffer
        vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
        vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
        vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
        vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
        acc_buffer_ptr += 16;
      }
      // Handle 8 input channels at a time.
      for (; ic <= input_depth - 8; ic += 8)
      {
        // Load the filters, add filter_offset.
        const uint8x8_t filter_u8 = vld1_u8(local_filter_ptr);
        local_filter_ptr += 8;
        const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
        const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
        // Load the inputs, add input_offset.
        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
        local_input_ptr += 8;
        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
        const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
        // Load the accumulators from acc_buffer
        int32x4_t acc[2];
        for (int i = 0; i < 2; i++)
        {
          acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate
        acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
        acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 2; i++)
        {
          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 8;
      }
      // Handle one input channel at a time.
      for (; ic < input_depth; ic++)
      {
        const uint16_t input_val = *local_input_ptr++ + input_offset;
        const uint16_t filter_val = *local_filter_ptr++ + filter_offset;
        *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
      }
      input_ptr += input_ptr_increment;
    }
  }
};
template <> struct QuantizedDepthwiseConvKernel<true, 16, 1>
{
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
                  const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
  {
    (void)input_depth;
    (void)depth_multiplier;
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8[2];
    for (int i = 0; i < 2; i++)
    {
      filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
    }
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++)
    {
      filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
    }
    for (int i = 0; i < 2; i++)
    {
      filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++)
    {
      // Load the inputs, add input_offset.
      uint8x8_t input_u8[2];
      for (int i = 0; i < 2; i++)
      {
        input_u8[i] = vld1_u8(input_ptr + 8 * i);
      }
      input_ptr += input_ptr_increment;
      int16x8_t input[2];
      for (int i = 0; i < 2; i++)
      {
        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
      }
      for (int i = 0; i < 2; i++)
      {
        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
      }
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++)
      {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++)
      {
        acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]), vget_low_s16(filter[i]));
        acc[2 * i + 1] =
          vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), vget_high_s16(filter[i]));
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++)
      {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};
template <> struct QuantizedDepthwiseConvKernel<true, 8, 1>
{
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
                  const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
  {
    (void)input_depth;
    (void)depth_multiplier;
    // Load the filters, add filter_offset.
    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++)
    {
      // Load the inputs, add input_offset.
      const uint8x8_t input_u8 = vld1_u8(input_ptr);
      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
      // Load the accumulators from acc_buffer
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++)
      {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
      acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++)
      {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
      input_ptr += input_ptr_increment;
    }
  }
};
template <> struct QuantizedDepthwiseConvKernel<true, 1, 16>
{
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
                  const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
  {
    (void)input_depth;
    (void)depth_multiplier;
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8[2];
    for (int i = 0; i < 2; i++)
    {
      filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
    }
    int16x8_t filter[2];
    for (int i = 0; i < 2; i++)
    {
      filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
    }
    for (int i = 0; i < 2; i++)
    {
      filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++)
    {
      uint8_t input_u8 = *input_ptr;
      input_ptr += input_ptr_increment;
      uint16_t input = static_cast<int16_t>(input_u8 + input_offset);
      // Load the accumulators from acc_buffer
      int32x4_t acc[4];
      for (int i = 0; i < 4; i++)
      {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++)
      {
        acc[2 * i + 0] = vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input);
        acc[2 * i + 1] = vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++)
      {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
  }
};
template <> struct QuantizedDepthwiseConvKernel<true, 1, 32>
{
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
                  const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
  {
    (void)input_depth;
    (void)depth_multiplier;
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
    uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
    uint8x8_t filter_u8_2 = vld1_u8(filter_ptr + 8 * 2);
    uint8x8_t filter_u8_3 = vld1_u8(filter_ptr + 8 * 3);
    int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
    int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
    int16x8_t filter_2 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_2));
    int16x8_t filter_3 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_3));
    filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
    filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
    filter_2 = vaddq_s16(filter_2, vdupq_n_s16(filter_offset));
    filter_3 = vaddq_s16(filter_3, vdupq_n_s16(filter_offset));
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++)
    {
      uint8_t input_u8 = *input_ptr;
      input_ptr += input_ptr_increment;
      uint16_t input = static_cast<int16_t>(input_u8 + input_offset);
      // Load the accumulators from acc_buffer
      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
      int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
      int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
      int32x4_t acc_5 = vld1q_s32(acc_buffer_ptr + 4 * 5);
      int32x4_t acc_6 = vld1q_s32(acc_buffer_ptr + 4 * 6);
      int32x4_t acc_7 = vld1q_s32(acc_buffer_ptr + 4 * 7);
      // Multiply-accumulate
      acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
      acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
      acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
      acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
      acc_4 = vmlal_n_s16(acc_4, vget_low_s16(filter_2), input);
      acc_5 = vmlal_n_s16(acc_5, vget_high_s16(filter_2), input);
      acc_6 = vmlal_n_s16(acc_6, vget_low_s16(filter_3), input);
      acc_7 = vmlal_n_s16(acc_7, vget_high_s16(filter_3), input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
      vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
      vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
      vst1q_s32(acc_buffer_ptr + 4 * 5, acc_5);
      vst1q_s32(acc_buffer_ptr + 4 * 6, acc_6);
      vst1q_s32(acc_buffer_ptr + 4 * 7, acc_7);
      acc_buffer_ptr += 32;
    }
  }
};
template <> struct QuantizedDepthwiseConvKernel<true, 1, 20>
{
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
                  const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
  {
    (void)input_depth;
    (void)depth_multiplier;
    // Load the filters, add filter_offset.
    // NEON wants to load 8 bytes at a time, but 20 is not divisible by 8.
    // We load the first 16 bytes into filter_u8_{0,1} as usual.
    // Then we load the 8 last bytes into filter_u8_x (x for 'extra').
    // This is redundant: the first 4 bytes of filter_u8_x are the same
    // as the last 4 bytes of filter_u8_1.
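    // Only the high half of filter_x (filter values 16..19) is consumed by
    // the multiply-accumulate below.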
    uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
    uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
    uint8x8_t filter_u8_x = vld1_u8(filter_ptr + 8 * 1 + 4);
    int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
    int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
    int16x8_t filter_x = vreinterpretq_s16_u16(vmovl_u8(filter_u8_x));
    filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
    filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
    filter_x = vaddq_s16(filter_x, vdupq_n_s16(filter_offset));
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++)
    {
      uint8_t input_u8 = *input_ptr;
      input_ptr += input_ptr_increment;
      uint16_t input = static_cast<int16_t>(input_u8 + input_offset);
      // Load the accumulators from acc_buffer
      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
      int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
      int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
      // Multiply-accumulate
      acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
      acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
      acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
      acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
      acc_4 = vmlal_n_s16(acc_4, vget_high_s16(filter_x), input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
      vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
      vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
      acc_buffer_ptr += 20;
    }
  }
};
template <> struct QuantizedDepthwiseConvKernel<true, 1, 8>
{
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
                  const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
  {
    (void)input_depth;
    (void)depth_multiplier;
    // Load the filters, add filter_offset.
    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
    const int16x8_t filter =
      vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset));
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++)
    {
      uint8_t input_u8 = *input_ptr;
      input_ptr += input_ptr_increment;
      uint16_t input = static_cast<int16_t>(input_u8 + input_offset);
      // Load the accumulators from acc_buffer
      int32x4_t acc[2];
      for (int i = 0; i < 2; i++)
      {
        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input);
      acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++)
      {
        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
  }
};
template <> struct QuantizedDepthwiseConvKernel<true, 2, 1>
{
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
                  const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
  {
    (void)input_depth;
    (void)depth_multiplier;
    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
    const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2)
    {
      // Load the accumulators from acc_buffer.
      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
      uint16x4_t input_u16 = vdup_n_u16(0);
      input_u16 = vset_lane_u16((reinterpret_cast<const uint16_t *>(input_ptr))[0], input_u16, 0);
      input_ptr += input_ptr_increment;
      input_u16 = vset_lane_u16((reinterpret_cast<const uint16_t *>(input_ptr))[0], input_u16, 1);
      input_ptr += input_ptr_increment;
      const int16x4_t input_s16 =
        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16))));
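      // (Each 16-bit lane load above fetches both uint8 channels of one pixel
      //  at once; reinterpreting back to uint8 lanes assumes a little-endian
      //  target, which holds for the ARM configurations this NEON path serves.)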
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc = vmlal_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer.
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }

    // Handle 1 output pixel at a time.
    for (; outp < num_output_pixels; outp++)
    {
      // Load the accumulators from acc_buffer.
      int32x2_t acc = vld1_s32(acc_buffer_ptr);
      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vdup_n_u8(0);
      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
      input_ptr += input_ptr_increment;
      const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));

      // Multiply-accumulate.
      acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
      // Store the accumulators back to acc_buffer.
      vst1_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};
template <> struct QuantizedDepthwiseConvKernel<true, 4, 1>
{
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
                  const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
  {
    (void)input_depth;
    (void)depth_multiplier;
    if (num_output_pixels <= 0)
    {
      return;
    }

    // Load the filters, add filter_offset.
    uint8x8_t filter_u8 = vdup_n_u8(0);
    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
    const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));

    int outp = 0;

    // Handle one output pixel at a time until second to the last pixel. Second
    // to the last because we read eight input pixels while only processing
    // four.
    for (; outp < num_output_pixels - 1; outp++)
    {
      // Load the accumulators from acc_buffer
      int32x4_t acc;
      acc = vld1q_s32(acc_buffer_ptr);

      // Load the inputs, add input_offset.
      uint8x8_t input_u8 = vld1_u8(input_ptr);
      input_ptr += input_ptr_increment;
      const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
      // Multiply-accumulate
      acc = vmlal_s16(acc, filter, input);
      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }

    // Handle the last output pixel.
    // Load the accumulators from acc_buffer
    int32x4_t acc;
    acc = vld1q_s32(acc_buffer_ptr);

    // Load the inputs, add input_offset.
    uint8x8_t input_u8 = vdup_n_u8(0);
    input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
    input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
    input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
    input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
    const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
    const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
    // Multiply-accumulate
    acc = vmlal_s16(acc, filter, input);
    // Store the accumulators back to acc_buffer
    vst1q_s32(acc_buffer_ptr, acc);
  }
};
1555 template <> struct QuantizedDepthwiseConvKernel<false, 12, 1>
1557 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1558 const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
1559 const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
1562 (void)depth_multiplier;
1563 // Load the filters, add filter_offset.
1564 uint8x8_t filter_u8_0 = vld1_u8(filter_ptr);
1565 uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 4);
1566 int16x8_t filter_s16_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
1567 int16x8_t filter_s16_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
1568 filter_s16_0 = vaddq_s16(filter_s16_0, vdupq_n_s16(filter_offset));
1569 filter_s16_1 = vaddq_s16(filter_s16_1, vdupq_n_s16(filter_offset));
1570 int16x4_t filter_0 = vget_low_s16(filter_s16_0);
1571 int16x4_t filter_1 = vget_high_s16(filter_s16_0);
1572 int16x4_t filter_2 = vget_high_s16(filter_s16_1);
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++)
    {
      // Load the inputs, add input_offset.
      uint8x8_t input_u8_0 = vld1_u8(input_ptr);
      uint8x8_t input_u8_1 = vld1_u8(input_ptr + 4);
      input_ptr += input_ptr_increment;
      int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
      int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
      input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
      input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));

      // Load the accumulators from acc_buffer
      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);

      // Multiply-accumulate
      acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), filter_0);
      acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), filter_1);
      acc_2 = vmlal_s16(acc_2, vget_high_s16(input_1), filter_2);

      // Store the accumulators back to acc_buffer
      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);

      acc_buffer_ptr += 12;
    }
  }
};
#endif // USE_NEON
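
// For orientation, a scalar sketch of the contract every specialized kernel
// above implements (illustrative pseudocode; it matches what
// QuantizedDepthwiseConvAccumRowGeneric below does per pixel). For each of
// num_output_pixels pixels, each input channel ic feeds depth_multiplier
// filter taps, accumulating into the int32 buffer:
//
//   for (int outp = 0; outp < num_output_pixels; ++outp)
//   {
//     for (int ic = 0; ic < input_depth; ++ic)
//       for (int m = 0; m < depth_multiplier; ++m)
//         acc_buffer_ptr[outp * input_depth * depth_multiplier + ic * depth_multiplier + m] +=
//           (input_ptr[ic] + input_offset) * (filter_ptr[ic * depth_multiplier + m] + filter_offset);
//     input_ptr += input_ptr_increment;
//   }
//
// The specializations merely vectorize this loop nest for fixed
// (input_depth, depth_multiplier) pairs.
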
// Accumulates the effect of one row of the filter, on a segment of one row
// of the output, accessing the corresponding one row of the input.
template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, int input_depth,
                                    int input_width, const uint8_t *input_data,
                                    int16_t input_offset, int pad_width, int depth_multiplier,
                                    int filter_width, const uint8_t *filter_data,
                                    int16_t filter_offset, int out_x_buffer_start,
                                    int out_x_buffer_end, int output_depth, int32_t *acc_buffer)
{
  // Sanity check parameters. This is important in particular to ensure
  // that we keep the number of template instantiations minimal, so we don't
  // increase binary size unnecessarily.
  static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
  static_assert(kFixedInputDepth || kAllowStrided, "");
  assert(stride == 1 || kAllowStrided);
  if (kFixedInputDepth)
  {
    assert(input_depth == kFixedInputDepth);
  }
  if (kFixedDepthMultiplier)
  {
    assert(depth_multiplier == kFixedDepthMultiplier);
  }
  assert(output_depth == input_depth * depth_multiplier);
  const int input_ptr_increment = stride * input_depth;
  const uint8_t *filter_base_ptr = filter_data;
  for (int filter_x = 0; filter_x < filter_width; ++filter_x)
  {
    // For the current (filter_x, filter_y) point in the filter,
    // compute the boundaries of the corresponding output row segment.
    int out_x_loop_start_unclamped = 0;
    int out_x_loop_end_unclamped = 0;
    if (kAllowStrided)
    {
      if (stride == 2)
      {
        out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 1) / 2;
        out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 1) / 2;
      }
      else if (stride == 4)
      {
        out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 3) / 4;
        out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 3) / 4;
      }
      else
      {
        out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + stride - 1) / stride;
        out_x_loop_end_unclamped =
          (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride;
      }
    }
    else
    {
      out_x_loop_start_unclamped = pad_width - dilation_factor * filter_x;
      out_x_loop_end_unclamped = pad_width + input_width - dilation_factor * filter_x;
    }
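    // (Derivation, for reference: output x touches input column
    //  in_x = x * stride - pad_width + dilation_factor * filter_x, so the
    //  first valid x solves in_x >= 0, i.e. x >= (pad_width -
    //  dilation_factor * filter_x) / stride rounded up; for nonnegative
    //  numerators that is the ceil-division pattern (a + stride - 1) / stride
    //  used above. The stride == 2 and stride == 4 branches are the same
    //  formula with the divisor known at compile time, and the non-strided
    //  path is the stride == 1 case with no division at all.)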
    // The kernel will have to iterate on the segment of the output row
    // that starts at out_x_loop_start and ends at out_x_loop_end.
    const int out_x_loop_start = std::max(out_x_buffer_start, out_x_loop_start_unclamped);
    const int out_x_loop_end = std::min(out_x_buffer_end, out_x_loop_end_unclamped);
    int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
    const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
    const uint8_t *input_ptr = input_data + in_x_origin * input_depth;
    const int num_output_pixels = out_x_loop_end - out_x_loop_start;
    QuantizedDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run(
      num_output_pixels, input_depth, depth_multiplier, input_ptr, input_offset,
      input_ptr_increment, filter_base_ptr, filter_offset, acc_buffer_ptr);
    filter_base_ptr += output_depth;
  }
}

// Generic fallback of QuantizedDepthwiseConvAccumRow, portable, non-templatized.
inline void QuantizedDepthwiseConvAccumRowGeneric(int stride, int dilation_factor, int input_depth,
                                                  int input_width, const uint8_t *input_data,
                                                  int16_t input_offset, int pad_width,
                                                  int depth_multiplier, int filter_width,
                                                  const uint8_t *filter_data, int16_t filter_offset,
                                                  int out_x_buffer_start, int out_x_buffer_end,
                                                  int output_depth, int32_t *acc_buffer)
{
1691 for (int filter_x = 0; filter_x < filter_width; ++filter_x)
1693 const int out_x_loop_start = std::max(
1694 out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride);
1695 const int out_x_loop_end =
1696 std::min(out_x_buffer_end,
1697 (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride);
1699 int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
1700 const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
1701 const uint8_t *input_ptr = input_data + in_x_origin * input_depth;
1702 const int input_ptr_increment = (stride - 1) * input_depth;
1703 for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++)
1705 const uint8_t *filter_ptr = filter_base_ptr;
1706 for (int ic = 0; ic < input_depth; ++ic)
1708 const int16_t input_val = *input_ptr++ + input_offset;
1709 for (int m = 0; m < depth_multiplier; m++)
1711 const int16_t filter_val = *filter_ptr++ + filter_offset;
1712 *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
1715 input_ptr += input_ptr_increment;
1717 filter_base_ptr += output_depth;
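
// Note on the generic fallback above: the inner channel loop already advances
// input_ptr by input_depth on its own, so the per-pixel increment only adds
// the remaining (stride - 1) * input_depth, for a net step of
// stride * input_depth per output pixel -- the same input_ptr_increment the
// specialized kernels receive.
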
// Initializes the accumulator buffer with bias values.
inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
                                       const int32_t *bias_data, int32_t *acc_buffer)
{
  int i = 0;
#ifdef USE_NEON
  if (output_depth == 1)
  {
    const int32x4_t b = vdupq_n_s32(bias_data[0]);
    for (; i <= num_output_pixels - 16; i += 16)
    {
      vst1q_s32(acc_buffer + i + 0, b);
      vst1q_s32(acc_buffer + i + 4, b);
      vst1q_s32(acc_buffer + i + 8, b);
      vst1q_s32(acc_buffer + i + 12, b);
    }
    for (; i <= num_output_pixels - 4; i += 4)
    {
      vst1q_s32(acc_buffer + i, b);
    }
  }
  else if (output_depth == 2)
  {
    int32x4_t b = vdupq_n_s32(bias_data[0]);
    b = vsetq_lane_s32(bias_data[1], b, 1);
    b = vsetq_lane_s32(bias_data[1], b, 3);
    for (; i <= num_output_pixels - 8; i += 8)
    {
      vst1q_s32(acc_buffer + 2 * i + 0, b);
      vst1q_s32(acc_buffer + 2 * i + 4, b);
      vst1q_s32(acc_buffer + 2 * i + 8, b);
      vst1q_s32(acc_buffer + 2 * i + 12, b);
    }
    for (; i <= num_output_pixels - 2; i += 2)
    {
      vst1q_s32(acc_buffer + 2 * i, b);
    }
  }
  else if (output_depth == 4)
  {
    const int32x4_t b = vld1q_s32(bias_data);
    for (; i <= num_output_pixels - 4; i += 4)
    {
      vst1q_s32(acc_buffer + 4 * i + 0, b);
      vst1q_s32(acc_buffer + 4 * i + 4, b);
      vst1q_s32(acc_buffer + 4 * i + 8, b);
      vst1q_s32(acc_buffer + 4 * i + 12, b);
    }
    for (; i < num_output_pixels; i++)
    {
      vst1q_s32(acc_buffer + 4 * i, b);
    }
  }
  else if (output_depth == 8)
  {
    const int32x4_t b0 = vld1q_s32(bias_data);
    const int32x4_t b1 = vld1q_s32(bias_data + 4);
    for (; i <= num_output_pixels - 2; i += 2)
    {
      vst1q_s32(acc_buffer + 8 * i + 0, b0);
      vst1q_s32(acc_buffer + 8 * i + 4, b1);
      vst1q_s32(acc_buffer + 8 * i + 8, b0);
      vst1q_s32(acc_buffer + 8 * i + 12, b1);
    }
    for (; i < num_output_pixels; i++)
    {
      vst1q_s32(acc_buffer + 8 * i + 0, b0);
      vst1q_s32(acc_buffer + 8 * i + 4, b1);
    }
  }
  else if (output_depth == 16)
  {
    const int32x4_t b0 = vld1q_s32(bias_data);
    const int32x4_t b1 = vld1q_s32(bias_data + 4);
    const int32x4_t b2 = vld1q_s32(bias_data + 8);
    const int32x4_t b3 = vld1q_s32(bias_data + 12);
    for (; i < num_output_pixels; i++)
    {
      vst1q_s32(acc_buffer + 16 * i + 0, b0);
      vst1q_s32(acc_buffer + 16 * i + 4, b1);
      vst1q_s32(acc_buffer + 16 * i + 8, b2);
      vst1q_s32(acc_buffer + 16 * i + 12, b3);
    }
  }
#endif
  for (; i < num_output_pixels; i++)
  {
    memcpy(acc_buffer + i * output_depth, bias_data, sizeof(acc_buffer[0]) * output_depth);
  }
}
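
// For reference: acc_buffer is laid out as [pixel][channel], i.e. entry
// acc_buffer[p * output_depth + c] holds the running int32 accumulator for
// channel c of output pixel p, which is why the scalar fallback above can
// seed one pixel with a single memcpy of output_depth bias values.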

inline void DepthwiseConvGeneral(const DepthwiseConvParams &params, const Shape &input_shape,
                                 const uint8_t *input_data, const Shape &filter_shape,
                                 const uint8_t *filter_data, const Shape &bias_shape,
                                 const int32_t *bias_data, const Shape &output_shape,
                                 uint8_t *output_data)
{
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;
  const int depth_multiplier = params.depth_multiplier;
  const int32_t output_activation_min = params.quantized_activation_min;
  const int32_t output_activation_max = params.quantized_activation_max;
  const int32_t input_offset = params.input_offset;
  const int32_t filter_offset = params.weights_offset;
  const int32_t output_offset = params.output_offset;
  const int32_t output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int input_depth = input_shape.Dims(3);
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);

  const bool shift_left = (output_shift > 0);
  const int32_t multiplier_power_of_two = shift_left ? (1 << output_shift) : 1;
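
  // Requantization convention used below: a positive output_shift means the
  // effective scale is output_multiplier * 2^output_shift, applied as an
  // integer multiply by multiplier_power_of_two before the fixed-point
  // multiply; a non-positive shift is instead applied afterwards as a
  // rounding divide by 2^(-output_shift). With output_multiplier a Q0.31
  // fixed-point value, both paths compute roughly:
  //
  //   result = round(acc * output_multiplier * 2^output_shift / 2^31)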
  static const int kAccBufferMaxSize = 2048;
  int32_t acc_buffer[kAccBufferMaxSize];
  assert(kAccBufferMaxSize >= output_depth);
  const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
  const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
  assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize);
  assert(kAccBufferActualSize <= kAccBufferMaxSize);
  assert(kOutputPixelsInAccBuffer >= 1);
  UNUSED_RELEASE(kAccBufferActualSize);
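
  // Worked example of the sizing above (illustrative numbers): with
  // output_depth == 24, kOutputPixelsInAccBuffer = 2048 / 24 = 85 pixels,
  // so each pass over the x loop below requantizes 85 * 24 = 2040 of the
  // 2048 available int32 slots; the remainder is simply unused.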

  // row_accum_func will point to the core accumulation function to be used
  // for this DepthwiseConv op.
  using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric);
  row_accum_func_t row_accum_func = nullptr;

#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \
  if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&                                  \
      (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&                             \
      depth_multiplier == FIXED_DEPTH_MULTIPLIER)                                                 \
  {                                                                                               \
    row_accum_func =                                                                              \
      QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>;   \
  }

#ifdef USE_NEON
  // We go over our list of kernels by decreasing order of preference
  // for the cases where multiple kernels could apply.

  // Start with the fastest kernels: AllowStrided=false, fixed input depth.

  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 12, 1)

  // Next come the strided kernels: AllowStrided=true, fixed input depth.
  // They are a bit less efficient, but allow stride != 1.

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)

  // Finally, the kernels allowing a variable input depth;
  // these are the least efficient but most general kernels.

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3)
#endif // USE_NEON

  // No matching fast kernel found, use slow fallback.
  if (!row_accum_func)
  {
    row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;
  }
#undef TFMINI_USE_DEPTHWISECONV_KERNEL
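
  // For clarity, one expansion of the selection macro above (illustrative):
  // TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1) becomes
  //
  //   if (!row_accum_func && (stride_width == 1 || false) &&
  //       (input_depth == 8 || 8 == 0) && depth_multiplier == 1)
  //   {
  //     row_accum_func = QuantizedDepthwiseConvAccumRow<false, 8, 1>;
  //   }
  //
  // so the first matching entry in the preference-ordered list wins.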

  const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
  const int input_batch_stride = input_height_stride * input_shape.Dims(1);
  const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);

  // Now that we have determined row_accum_func, we can start work.
  uint8_t *output_ptr = output_data;
  for (int b = 0; b < batches; ++b)
  {
    for (int out_y = 0; out_y < output_height; ++out_y)
    {
      const int in_y_origin = (out_y * stride_height) - pad_height;
      const int filter_y_start =
        std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor);
      const int filter_y_end =
        std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) /
                                  dilation_height_factor);
      for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
           out_x_buffer_start += kOutputPixelsInAccBuffer)
      {
        const int out_x_buffer_end =
          std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
        // We call a 'pixel' a group of activations that share all but the
        // 'depth'/'channel' coordinate. num_output_pixels is the number of
        // output pixels that we will accumulate in this loop iteration.
        const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
        // Initialize our local accumulator with the bias values, so we don't
        // have to add them later.
        DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer);
        // Accumulation loop. Most of the time should be spent in here.
        for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
        {
          const int in_y = in_y_origin + dilation_height_factor * filter_y;
          row_accum_func(stride_width, dilation_width_factor, input_depth, input_width,
                         input_data + in_y * input_height_stride + b * input_batch_stride,
                         input_offset, pad_width, depth_multiplier, filter_width,
                         filter_data + filter_y * filter_height_stride, filter_offset,
                         out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer);
        }
        // Finished accumulating int32 values. Now need to convert them to
        // the final 8bit form and store them.
        const int num_output_values = output_depth * num_output_pixels;
        int i = 0;
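        // (Scalar reference for the NEON requantization below, mirroring the
        //  leftover loop at the end of this block: each accumulator goes
        //  through
        //
        //    acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
        //    acc += output_offset;
        //    acc = std::min(std::max(acc, output_activation_min), output_activation_max);
        //    *output_ptr++ = static_cast<uint8_t>(acc);
        //
        //  the vector paths just do this 16, 8, or 4 lanes at a time.)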
#ifdef USE_NEON
        using gemmlowp::RoundingDivideByPOT;
        const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
        const int32x4_t output_activation_min_vec = vdupq_n_s32(output_activation_min);
        const int32x4_t output_activation_max_vec = vdupq_n_s32(output_activation_max);
        // Handle 16 values at once.
        // This allows us to issue 4 mutually independent int32
        // multiplications (vqrdmulh), which should alleviate most of their
        // high latency.
        for (; i <= num_output_values - 16; i += 16)
        {
          int32x4_t acc[4];
          for (int j = 0; j < 4; j++)
          {
            acc[j] = vld1q_s32(acc_buffer + i + 4 * j);
          }

          if (!shift_left)
          {
            // Fixed-point multiplication.
            for (int j = 0; j < 4; j++)
            {
              acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
            }
            // Rounding right shift.
            for (int j = 0; j < 4; j++)
            {
              acc[j] = RoundingDivideByPOT(acc[j], -output_shift);
            }
          }
          else
          {
            // Fixed-point multiplication.
            for (int j = 0; j < 4; j++)
            {
              acc[j] = vmulq_n_s32(acc[j], multiplier_power_of_two);
              acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
            }
          }
          // Add the output offset.
          for (int j = 0; j < 4; j++)
          {
            acc[j] = vaddq_s32(acc[j], output_offset_vec);
          }
          // Apply the activation function.
          for (int j = 0; j < 4; j++)
          {
            acc[j] = vmaxq_s32(acc[j], output_activation_min_vec);
          }
          for (int j = 0; j < 4; j++)
          {
            acc[j] = vminq_s32(acc[j], output_activation_max_vec);
          }
          // Saturating cast to uint8_t and store to destination.
          int16x4_t acc_s16[4];
          for (int j = 0; j < 4; j++)
          {
            acc_s16[j] = vqmovn_s32(acc[j]);
          }
          const int16x8_t res_s16_0 = vcombine_s16(acc_s16[0], acc_s16[1]);
          const int16x8_t res_s16_1 = vcombine_s16(acc_s16[2], acc_s16[3]);
          const uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0);
          const uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1);
          vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1));
          output_ptr += 16;
        }
        // Handle 8 values at once.
        // Not as good as 16 (now we're only issuing 2 mutually independent
        // vqrdmulh instructions, so we're probably paying for their high
        // latency).
        for (; i <= num_output_values - 8; i += 8)
        {
          int32x4_t acc0 = vld1q_s32(acc_buffer + i);
          int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4);
          if (!shift_left)
          {
            // Fixed-point multiplication.
            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
            // Rounding right shift.
            acc0 = RoundingDivideByPOT(acc0, -output_shift);
            acc1 = RoundingDivideByPOT(acc1, -output_shift);
          }
          else
          {
            // Fixed-point multiplication.
            acc0 = vmulq_n_s32(acc0, multiplier_power_of_two);
            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);

            acc1 = vmulq_n_s32(acc1, multiplier_power_of_two);
            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
          }
          // Add the output offset.
          acc0 = vaddq_s32(acc0, output_offset_vec);
          acc1 = vaddq_s32(acc1, output_offset_vec);
          // Apply the activation function.
          acc0 = vmaxq_s32(acc0, output_activation_min_vec);
          acc1 = vmaxq_s32(acc1, output_activation_min_vec);
          acc0 = vminq_s32(acc0, output_activation_max_vec);
          acc1 = vminq_s32(acc1, output_activation_max_vec);
          // Saturating cast to uint8_t and store to destination.
          const int16x4_t acc0_s16 = vqmovn_s32(acc0);
          const int16x4_t acc1_s16 = vqmovn_s32(acc1);
          const int16x8_t res_s16 = vcombine_s16(acc0_s16, acc1_s16);
          const uint8x8_t res_u8 = vqmovun_s16(res_s16);
          vst1_u8(output_ptr, res_u8);
          output_ptr += 8;
        }
        // Handle 4 values at once. Now we're paying the full price of the
        // high latency of vqrdmulh. Also, storing only 4 bytes at the end
        // (without any alignment) can only be done 1 byte at a time.
        // Yet, that is still worth doing to minimize the amount of leftover
        // that will have to go through the very slow scalar code.
        for (; i <= num_output_values - 4; i += 4)
        {
          int32x4_t acc = vld1q_s32(acc_buffer + i);
          if (!shift_left)
          {
            // Fixed-point multiplication.
            acc = vqrdmulhq_n_s32(acc, output_multiplier);
            // Rounding right shift.
            acc = RoundingDivideByPOT(acc, -output_shift);
          }
          else
          {
            // Fixed-point multiplication.
            acc = vmulq_n_s32(acc, multiplier_power_of_two);
            acc = vqrdmulhq_n_s32(acc, output_multiplier);
          }
          // Add the output offset.
          acc = vaddq_s32(acc, output_offset_vec);
          // Apply the activation function.
          acc = vmaxq_s32(acc, output_activation_min_vec);
          acc = vminq_s32(acc, output_activation_max_vec);
          // Saturating cast to uint8_t and store to destination.
          const int16x4_t acc_s16 = vqmovn_s32(acc);
          const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16);
          const uint8x8_t res_u8 = vqmovun_s16(res_s16);
          vst1_lane_u8(output_ptr + 0, res_u8, 0);
          vst1_lane_u8(output_ptr + 1, res_u8, 1);
          vst1_lane_u8(output_ptr + 2, res_u8, 2);
          vst1_lane_u8(output_ptr + 3, res_u8, 3);
          output_ptr += 4;
        }
#endif // USE_NEON
        // Handle leftover values, one by one. This is very slow.
        for (; i < num_output_values; i++)
        {
          int32_t acc = acc_buffer[i];
          acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
          acc += output_offset;
          acc = std::max(acc, output_activation_min);
          acc = std::min(acc, output_activation_max);
          *output_ptr++ = static_cast<uint8_t>(acc);
        }
      }
    }
  }
}

} // namespace optimized
} // namespace cker
} // namespace nnfw

#endif // __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_UINT8_H__