/*
 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_FLOAT_H__
#define __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_FLOAT_H__

#include <algorithm>
#include <cassert>
#include <cstring>

#include "cker/Shape.h"
#include "cker/Types.h"
#include "cker/Utils.h"
#include "cker/neon/neon_check.h"
33 // Implementation of float DepthwiseConv
// Primary template for the per-row depthwise-conv micro-kernels.
// It is intentionally empty: only the explicit specializations below
// (selected by stride support, fixed input depth and fixed depth
// multiplier) provide a Run() implementation.
template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
struct FloatDepthwiseConvKernel
{
};
42 template <> struct FloatDepthwiseConvKernel<false, 8, 1>
44 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
45 const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
46 float *acc_buffer_ptr)
49 (void)depth_multiplier;
50 (void)input_ptr_increment;
52 float32x4_t filter[2];
53 for (int i = 0; i < 2; i++)
55 filter[i] = vld1q_f32(filter_ptr + 4 * i);
58 // Handle 2 output pixels at a time.
59 for (; outp <= num_output_pixels - 2; outp += 2)
63 for (int i = 0; i < 4; i++)
65 input[i] = vld1q_f32(input_ptr + 4 * i);
68 // Load the accumulators from acc_buffer
70 for (int i = 0; i < 4; i++)
72 acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
74 // Multiply-accumulate
75 acc[0] = vmlaq_f32(acc[0], input[0], filter[0]);
76 acc[1] = vmlaq_f32(acc[1], input[1], filter[1]);
77 acc[2] = vmlaq_f32(acc[2], input[2], filter[0]);
78 acc[3] = vmlaq_f32(acc[3], input[3], filter[1]);
79 // Store the accumulators back to acc_buffer
80 for (int i = 0; i < 4; i++)
82 vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
86 // Handle one output pixel at a time.
87 for (; outp < num_output_pixels; outp++)
91 for (int i = 0; i < 2; i++)
93 input[i] = vld1q_f32(input_ptr + 4 * i);
96 // Load the accumulators from acc_buffer
98 for (int i = 0; i < 2; i++)
100 acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
102 // Multiply-accumulate
103 for (int i = 0; i < 2; i++)
105 acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
107 // Store the accumulators back to acc_buffer
108 for (int i = 0; i < 2; i++)
110 vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
117 template <> struct FloatDepthwiseConvKernel<false, 2, 1>
119 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
120 const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
121 float *acc_buffer_ptr)
124 (void)depth_multiplier;
125 (void)input_ptr_increment;
127 const float32x2_t filters = vld1_f32(filter_ptr);
128 const float32x4_t filters_dup2 = vcombine_f32(filters, filters);
130 // Handle 8 output pixels at a time.
131 for (; outp <= num_output_pixels - 8; outp += 8)
134 float32x4_t input[4];
135 for (int i = 0; i < 4; i++)
137 input[i] = vld1q_f32(input_ptr + 4 * i);
140 // Load the accumulators from acc_buffer
142 for (int i = 0; i < 4; i++)
144 acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
146 // Multiply-accumulate
147 for (int i = 0; i < 4; i++)
149 acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
151 // Store the accumulators back to acc_buffer
152 for (int i = 0; i < 4; i++)
154 vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
156 acc_buffer_ptr += 16;
158 // Handle 4 output pixels at a time.
159 for (; outp <= num_output_pixels - 4; outp += 4)
162 float32x4_t input[2];
163 for (int i = 0; i < 2; i++)
165 input[i] = vld1q_f32(input_ptr + 4 * i);
168 // Load the accumulators from acc_buffer
170 for (int i = 0; i < 2; i++)
172 acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
174 // Multiply-accumulate
175 for (int i = 0; i < 2; i++)
177 acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
179 // Store the accumulators back to acc_buffer
180 for (int i = 0; i < 2; i++)
182 vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
186 // Handle 2 output pixels at a time.
187 for (; outp <= num_output_pixels - 2; outp += 2)
190 const float32x4_t input = vld1q_f32(input_ptr);
192 // Load the accumulators from acc_buffer
193 float32x4_t acc = vld1q_f32(acc_buffer_ptr);
194 // Multiply-accumulate
195 acc = vmlaq_f32(acc, input, filters_dup2);
196 // Store the accumulators back to acc_buffer
197 vst1q_f32(acc_buffer_ptr, acc);
200 // Handle 1 output pixel at a time
201 for (; outp < num_output_pixels; outp++)
204 const float32x2_t input = vld1_f32(input_ptr);
206 // Load the accumulators from acc_buffer
207 float32x2_t acc = vld1_f32(acc_buffer_ptr);
208 // Multiply-accumulate
209 acc = vmla_f32(acc, input, filters);
210 // Store the accumulators back to acc_buffer
211 vst1_f32(acc_buffer_ptr, acc);
217 template <> struct FloatDepthwiseConvKernel<true, 0, 1>
219 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
220 const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
221 float *acc_buffer_ptr)
223 (void)depth_multiplier;
225 // Handle one output pixel at a time.
226 for (int outp = 0; outp < num_output_pixels; outp++)
228 const float *local_filter_ptr = filter_ptr;
229 const float *local_input_ptr = input_ptr;
231 // Handle 16 input channels at a time.
232 for (; ic <= input_depth - 16; ic += 16)
235 float32x4_t filter_0 = vld1q_f32(local_filter_ptr + 4 * 0);
236 float32x4_t filter_1 = vld1q_f32(local_filter_ptr + 4 * 1);
237 float32x4_t filter_2 = vld1q_f32(local_filter_ptr + 4 * 2);
238 float32x4_t filter_3 = vld1q_f32(local_filter_ptr + 4 * 3);
239 local_filter_ptr += 16;
241 float32x4_t input_0 = vld1q_f32(local_input_ptr + 4 * 0);
242 float32x4_t input_1 = vld1q_f32(local_input_ptr + 4 * 1);
243 float32x4_t input_2 = vld1q_f32(local_input_ptr + 4 * 2);
244 float32x4_t input_3 = vld1q_f32(local_input_ptr + 4 * 3);
245 local_input_ptr += 16;
246 // Load the accumulators from acc_buffer
247 float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
248 float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
249 float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
250 float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
251 // Multiply-accumulate
252 acc_0 = vmlaq_f32(acc_0, input_0, filter_0);
253 acc_1 = vmlaq_f32(acc_1, input_1, filter_1);
254 acc_2 = vmlaq_f32(acc_2, input_2, filter_2);
255 acc_3 = vmlaq_f32(acc_3, input_3, filter_3);
256 // Store the accumulators back to acc_buffer
257 vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
258 vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
259 vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
260 vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
261 acc_buffer_ptr += 16;
263 // Handle 4 input channels at a time.
264 for (; ic <= input_depth - 4; ic += 4)
268 filter = vld1q_f32(local_filter_ptr);
269 local_filter_ptr += 4;
272 input = vld1q_f32(local_input_ptr);
273 local_input_ptr += 4;
274 // Load the accumulators from acc_buffer
276 acc = vld1q_f32(acc_buffer_ptr);
277 // Multiply-accumulate
278 acc = vmlaq_f32(acc, input, filter);
279 // Store the accumulators back to acc_buffer
280 vst1q_f32(acc_buffer_ptr, acc);
283 // Handle one input channel at a time.
284 for (; ic < input_depth; ic++)
286 const float input_val = *local_input_ptr++;
287 const float filter_val = *local_filter_ptr++;
288 *acc_buffer_ptr++ += filter_val * input_val;
290 input_ptr += input_ptr_increment;
295 template <> struct FloatDepthwiseConvKernel<true, 0, 8>
297 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
298 const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
299 float *acc_buffer_ptr)
301 (void)depth_multiplier;
303 // Handle one output pixel at a time.
304 for (int outp = 0; outp < num_output_pixels; outp++)
306 const float *local_filter_ptr = filter_ptr;
307 const float *local_input_ptr = input_ptr;
309 // Handle 2 input channels at a time.
310 for (; ic <= input_depth - 2; ic += 2)
313 float32x4_t filter[4];
314 for (int i = 0; i < 4; i++)
316 filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
318 local_filter_ptr += 16;
320 const float32x2_t input = vld1_f32(local_input_ptr);
321 local_input_ptr += 2;
322 // Load the accumulators from acc_buffer
324 for (int i = 0; i < 4; i++)
326 acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
328 // Multiply-accumulate
329 acc[0] = vmlaq_lane_f32(acc[0], filter[0], input, 0);
330 acc[1] = vmlaq_lane_f32(acc[1], filter[1], input, 0);
331 acc[2] = vmlaq_lane_f32(acc[2], filter[2], input, 1);
332 acc[3] = vmlaq_lane_f32(acc[3], filter[3], input, 1);
333 // Store the accumulators back to acc_buffer
334 for (int i = 0; i < 4; i++)
336 vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
338 acc_buffer_ptr += 16;
340 // Handle one input channel at a time.
341 for (; ic < input_depth; ic++)
344 float32x4_t filter[2];
345 for (int i = 0; i < 2; i++)
347 filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
349 local_filter_ptr += 8;
351 const float input_val = *local_input_ptr++;
352 // Load the accumulators from acc_buffer
354 for (int i = 0; i < 2; i++)
356 acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
358 // Multiply-accumulate
359 for (int i = 0; i < 2; i++)
361 acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
363 // Store the accumulators back to acc_buffer
364 for (int i = 0; i < 2; i++)
366 vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
370 input_ptr += input_ptr_increment;
375 // Note this implementation is very slow for input_depths < 8
376 // (e.g. comparable to reference implementation) see, specializations for
377 // input_depth=3 below.
378 template <> struct FloatDepthwiseConvKernel<true, 0, 2>
380 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
381 const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
382 float *acc_buffer_ptr)
384 (void)depth_multiplier;
386 // Handle one output pixel at a time.
387 for (int outp = 0; outp < num_output_pixels; outp++)
389 const float *local_filter_ptr = filter_ptr;
390 const float *local_input_ptr = input_ptr;
392 // Handle 8 input channels at a time.
393 for (; ic <= input_depth - 8; ic += 8)
396 float32x4_t filter[4];
397 for (int i = 0; i < 4; i++)
399 filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
401 local_filter_ptr += 16;
403 float32x4x2_t input_dup2[2];
404 for (int i = 0; i < 2; i++)
406 const float32x4_t input = vld1q_f32(local_input_ptr + 4 * i);
407 input_dup2[i] = vzipq_f32(input, input);
409 local_input_ptr += 8;
410 // Load the accumulators from acc_buffer
412 for (int i = 0; i < 4; i++)
414 acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
416 // Multiply-accumulate
417 acc[0] = vmlaq_f32(acc[0], filter[0], input_dup2[0].val[0]);
418 acc[1] = vmlaq_f32(acc[1], filter[1], input_dup2[0].val[1]);
419 acc[2] = vmlaq_f32(acc[2], filter[2], input_dup2[1].val[0]);
420 acc[3] = vmlaq_f32(acc[3], filter[3], input_dup2[1].val[1]);
421 // Store the accumulators back to acc_buffer
422 for (int i = 0; i < 4; i++)
424 vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
426 acc_buffer_ptr += 16;
428 // Handle 4 input channels at a time.
429 for (; ic <= input_depth - 4; ic += 4)
432 float32x2_t filter[4];
433 for (int i = 0; i < 4; i++)
435 filter[i] = vld1_f32(local_filter_ptr + 2 * i);
437 local_filter_ptr += 8;
439 const float32x4_t input = vld1q_f32(local_input_ptr);
440 local_input_ptr += 4;
441 // Load the accumulators from acc_buffer
443 for (int i = 0; i < 4; i++)
445 acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
447 // Multiply-accumulate
448 acc[0] = vmla_lane_f32(acc[0], filter[0], vget_low_f32(input), 0);
449 acc[1] = vmla_lane_f32(acc[1], filter[1], vget_low_f32(input), 1);
450 acc[2] = vmla_lane_f32(acc[2], filter[2], vget_high_f32(input), 0);
451 acc[3] = vmla_lane_f32(acc[3], filter[3], vget_high_f32(input), 1);
452 // Store the accumulators back to acc_buffer
453 for (int i = 0; i < 4; i++)
455 vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
459 // Handle 2 input channels at a time.
460 for (; ic <= input_depth - 2; ic += 2)
463 const float32x4_t filter = vld1q_f32(local_filter_ptr);
464 local_filter_ptr += 4;
466 const float32x2_t input = vld1_f32(local_input_ptr);
467 local_input_ptr += 2;
468 // Load the accumulators from acc_buffer
470 for (int i = 0; i < 2; i++)
472 acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
474 // Multiply-accumulate
475 acc[0] = vmla_lane_f32(acc[0], vget_low_f32(filter), input, 0);
476 acc[1] = vmla_lane_f32(acc[1], vget_high_f32(filter), input, 1);
477 // Store the accumulators back to acc_buffer
478 for (int i = 0; i < 2; i++)
480 vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
484 // Handle one input channel at a time.
485 for (; ic < input_depth; ic++)
488 const float input_val = *local_input_ptr++;
489 // Multiply-accumulate
490 for (int i = 0; i < 2; i++)
492 acc_buffer_ptr[i] += local_filter_ptr[i] * input_val;
494 local_filter_ptr += 2;
497 input_ptr += input_ptr_increment;
502 template <> struct FloatDepthwiseConvKernel<true, 3, 2>
504 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
505 const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
506 float *acc_buffer_ptr)
509 (void)depth_multiplier;
512 float32x2_t filter[3];
513 for (int i = 0; i < 3; i++)
515 filter[i] = vld1_f32(filter_ptr + 2 * i);
517 // Handle one output pixel at a time.
518 for (int outp = 0; outp < num_output_pixels; outp++)
520 const float32x2_t input01 = vld1_f32(input_ptr);
521 const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
522 // Load the accumulators from acc_buffer
524 for (int i = 0; i < 3; i++)
526 acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
528 // Multiply-accumulate for each input channel there 2 outputs
529 acc[0] = vmla_lane_f32(acc[0], filter[0], input01, 0);
530 acc[1] = vmla_lane_f32(acc[1], filter[1], input01, 1);
531 acc[2] = vmla_lane_f32(acc[2], filter[2], input2, 0);
532 // Store the accumulators back to acc_buffer
533 for (int i = 0; i < 3; i++)
535 vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
538 input_ptr += input_ptr_increment;
543 template <> struct FloatDepthwiseConvKernel<true, 3, 4>
545 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
546 const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
547 float *acc_buffer_ptr)
550 (void)depth_multiplier;
553 float32x4_t filter[3];
554 for (int i = 0; i < 3; i++)
556 filter[i] = vld1q_f32(filter_ptr + 4 * i);
558 // Handle one output pixel at a time.
559 for (int outp = 0; outp < num_output_pixels; outp++)
561 // NOTE: we only want 3 values, so we read it as two ops where
562 // the second op just duplicates the lane
563 const float32x2_t input01 = vld1_f32(input_ptr);
564 const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
565 // Load the accumulators from acc_buffer
567 for (int i = 0; i < 3; i++)
569 acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
571 // Multiply-accumulate all outputs.
572 acc[0] = vmlaq_lane_f32(acc[0], filter[0], input01, 0);
573 acc[1] = vmlaq_lane_f32(acc[1], filter[1], input01, 1);
574 acc[2] = vmlaq_lane_f32(acc[2], filter[2], input2, 0);
575 // Store the accumulators back to acc_buffer
576 for (int i = 0; i < 3; i++)
578 vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
580 acc_buffer_ptr += 12;
581 input_ptr += input_ptr_increment;
586 template <> struct FloatDepthwiseConvKernel<true, 1, 8>
588 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
589 const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
590 float *acc_buffer_ptr)
593 (void)depth_multiplier;
596 float32x4_t filter[2];
597 for (int i = 0; i < 2; i++)
599 filter[i] = vld1q_f32(filter_ptr + 4 * i);
601 // Handle one output pixel at a time.
602 for (int outp = 0; outp < num_output_pixels; outp++)
605 const float input_val = *input_ptr;
606 input_ptr += input_ptr_increment;
607 // Load the accumulators from acc_buffer
609 for (int i = 0; i < 2; i++)
611 acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
613 // Multiply-accumulate
614 for (int i = 0; i < 2; i++)
616 acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
618 // Store the accumulators back to acc_buffer
619 for (int i = 0; i < 2; i++)
621 vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
628 template <> struct FloatDepthwiseConvKernel<true, 1, 32>
630 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
631 const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
632 float *acc_buffer_ptr)
635 (void)depth_multiplier;
638 float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0);
639 float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1);
640 float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2);
641 float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3);
642 float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4);
643 float32x4_t filter_5 = vld1q_f32(filter_ptr + 4 * 5);
644 float32x4_t filter_6 = vld1q_f32(filter_ptr + 4 * 6);
645 float32x4_t filter_7 = vld1q_f32(filter_ptr + 4 * 7);
647 // Handle one output pixel at a time.
648 for (int outp = 0; outp < num_output_pixels; outp++)
651 const float input_val = *input_ptr;
652 input_ptr += input_ptr_increment;
653 // Load the accumulators from acc_buffer
654 float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
655 float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
656 float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
657 float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
658 float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4);
659 float32x4_t acc_5 = vld1q_f32(acc_buffer_ptr + 4 * 5);
660 float32x4_t acc_6 = vld1q_f32(acc_buffer_ptr + 4 * 6);
661 float32x4_t acc_7 = vld1q_f32(acc_buffer_ptr + 4 * 7);
662 // Multiply-accumulate
663 acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val);
664 acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val);
665 acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val);
666 acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val);
667 acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val);
668 acc_5 = vmlaq_n_f32(acc_5, filter_5, input_val);
669 acc_6 = vmlaq_n_f32(acc_6, filter_6, input_val);
670 acc_7 = vmlaq_n_f32(acc_7, filter_7, input_val);
671 // Store the accumulators back to acc_buffer
672 vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
673 vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
674 vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
675 vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
676 vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4);
677 vst1q_f32(acc_buffer_ptr + 4 * 5, acc_5);
678 vst1q_f32(acc_buffer_ptr + 4 * 6, acc_6);
679 vst1q_f32(acc_buffer_ptr + 4 * 7, acc_7);
680 acc_buffer_ptr += 32;
685 template <> struct FloatDepthwiseConvKernel<true, 1, 20>
687 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
688 const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
689 float *acc_buffer_ptr)
692 (void)depth_multiplier;
695 float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0);
696 float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1);
697 float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2);
698 float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3);
699 float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4);
701 // Handle one output pixel at a time.
702 for (int outp = 0; outp < num_output_pixels; outp++)
705 const float input_val = *input_ptr;
706 input_ptr += input_ptr_increment;
707 // Load the accumulators from acc_buffer
708 float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
709 float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
710 float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
711 float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
712 float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4);
713 // Multiply-accumulate
714 acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val);
715 acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val);
716 acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val);
717 acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val);
718 acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val);
719 // Store the accumulators back to acc_buffer
720 vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
721 vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
722 vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
723 vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
724 vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4);
725 acc_buffer_ptr += 20;
730 template <> struct FloatDepthwiseConvKernel<true, 0, 16>
732 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
733 const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
734 float *acc_buffer_ptr)
736 (void)depth_multiplier;
738 // Handle one output pixel at a time.
739 for (int outp = 0; outp < num_output_pixels; outp++)
741 const float *local_filter_ptr = filter_ptr;
742 const float *local_input_ptr = input_ptr;
743 for (int ic = 0; ic < input_depth; ic++)
746 float32x4_t filter[4];
747 for (int i = 0; i < 4; i++)
749 filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
751 local_filter_ptr += 16;
753 const float input_val = *local_input_ptr++;
754 // Load the accumulators from acc_buffer
756 for (int i = 0; i < 4; i++)
758 acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
760 // Multiply-accumulate
761 for (int i = 0; i < 4; i++)
763 acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
765 // Store the accumulators back to acc_buffer
766 for (int i = 0; i < 4; i++)
768 vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
770 acc_buffer_ptr += 16;
772 input_ptr += input_ptr_increment;
777 template <> struct FloatDepthwiseConvKernel<true, 8, 1>
779 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
780 const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
781 float *acc_buffer_ptr)
784 (void)depth_multiplier;
787 float32x4_t filter[2];
788 for (int i = 0; i < 2; i++)
790 filter[i] = vld1q_f32(filter_ptr + 4 * i);
792 // Handle one output pixel at a time.
793 for (int outp = 0; outp < num_output_pixels; outp++)
796 float32x4_t input[2];
797 for (int i = 0; i < 2; i++)
799 input[i] = vld1q_f32(input_ptr + 4 * i);
801 // Load the accumulators from acc_buffer
803 for (int i = 0; i < 2; i++)
805 acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
807 // Multiply-accumulate
808 for (int i = 0; i < 2; i++)
810 acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
812 // Store the accumulators back to acc_buffer
813 for (int i = 0; i < 2; i++)
815 vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
818 input_ptr += input_ptr_increment;
823 template <> struct FloatDepthwiseConvKernel<true, 2, 1>
825 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
826 const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
827 float *acc_buffer_ptr)
830 (void)depth_multiplier;
832 float32x2_t filter = vld1_f32(filter_ptr);
833 float32x4_t filter_x4 = vcombine_f32(filter, filter);
836 // Handle two output pixels at a time.
837 for (; outp <= num_output_pixels - 2; outp += 2)
840 float32x2_t input_1 = vld1_f32(input_ptr);
841 input_ptr += input_ptr_increment;
842 float32x2_t input_2 = vld1_f32(input_ptr);
843 input_ptr += input_ptr_increment;
844 float32x4_t input = vcombine_f32(input_1, input_2);
846 // Load the accumulators from acc_buffer
847 float32x4_t acc = vld1q_f32(acc_buffer_ptr);
849 // Multiply-accumulate
850 acc = vmlaq_f32(acc, input, filter_x4);
852 // Store the accumulators back to acc_buffer
853 vst1q_f32(acc_buffer_ptr, acc);
856 // Handle one output pixel at a time.
857 for (; outp < num_output_pixels; outp++)
860 float32x2_t input = vld1_f32(input_ptr);
861 input_ptr += input_ptr_increment;
863 // Load the accumulators from acc_buffer
864 float32x2_t acc = vld1_f32(acc_buffer_ptr);
866 // Multiply-accumulate
867 acc = vmla_f32(acc, input, filter);
869 // Store the accumulators back to acc_buffer
870 vst1_f32(acc_buffer_ptr, acc);
876 template <> struct FloatDepthwiseConvKernel<true, 4, 1>
878 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
879 const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
880 float *acc_buffer_ptr)
883 (void)depth_multiplier;
885 float32x4_t filter = vld1q_f32(filter_ptr);
887 // Handle one output pixel at a time.
888 for (int outp = 0; outp < num_output_pixels; outp++)
891 float32x4_t input = vld1q_f32(input_ptr);
892 // Load the accumulators from acc_buffer
893 float32x4_t acc = vld1q_f32(acc_buffer_ptr);
894 // Multiply-accumulate
895 acc = vmlaq_f32(acc, input, filter);
896 // Store the accumulators back to acc_buffer
897 vst1q_f32(acc_buffer_ptr, acc);
899 input_ptr += input_ptr_increment;
905 // Accumulates the effect of one row of the filter, on a segment of one row
906 // of the output, accessing the corresponding one row of the input.
907 template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
908 void FloatDepthwiseConvAccumRow(int stride, int dilation_factor, int input_depth, int input_width,
909 const float *input_data, int pad_width, int depth_multiplier,
910 int filter_width, const float *filter_data, int out_x_buffer_start,
911 int out_x_buffer_end, int output_depth, float *acc_buffer)
913 // Sanity check parameters. This is important in particular to ensure
914 // that we keep the number of template instantiations minimal, so we don't
915 // increase binary size unnecessarily.
916 static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
917 static_assert(kFixedInputDepth || kAllowStrided, "");
918 assert(stride == 1 || kAllowStrided);
919 if (kFixedInputDepth)
921 assert(input_depth == kFixedInputDepth);
923 if (kFixedDepthMultiplier)
925 assert(depth_multiplier == kFixedDepthMultiplier);
927 assert(output_depth == input_depth * depth_multiplier);
928 const int input_ptr_increment = stride * input_depth;
929 const float *filter_base_ptr = filter_data;
930 for (int filter_x = 0; filter_x < filter_width; ++filter_x)
932 // For the current (filter_x, filter_y) point in the filter,
933 // compute the boundaries of the corresponding output row segment.
934 int out_x_loop_start_unclamped = 0;
935 int out_x_loop_end_unclamped = 0;
940 out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 1) / 2;
941 out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 1) / 2;
943 else if (stride == 4)
945 out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 3) / 4;
946 out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 3) / 4;
950 out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + stride - 1) / stride;
951 out_x_loop_end_unclamped =
952 (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride;
957 out_x_loop_start_unclamped = pad_width - dilation_factor * filter_x;
958 out_x_loop_end_unclamped = pad_width + input_width - dilation_factor * filter_x;
960 // The kernel will have to iterate on the segment of the
961 // output row that starts at out_x_loop_start and out_x_loop_end.
962 const int out_x_loop_start = std::max(out_x_buffer_start, out_x_loop_start_unclamped);
963 const int out_x_loop_end = std::min(out_x_buffer_end, out_x_loop_end_unclamped);
965 float *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
966 const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
967 const float *input_ptr = input_data + in_x_origin * input_depth;
968 const int num_output_pixels = out_x_loop_end - out_x_loop_start;
969 FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run(
970 num_output_pixels, input_depth, depth_multiplier, input_ptr, input_ptr_increment,
971 filter_base_ptr, acc_buffer_ptr);
972 filter_base_ptr += output_depth;
// generic fallback of FloatDepthwiseConvAccumRow, portable, non-templatized.
inline void FloatDepthwiseConvAccumRowGeneric(int stride, int dilation_factor, int input_depth,
                                              int input_width, const float *input_data,
                                              int pad_width, int depth_multiplier, int filter_width,
                                              const float *filter_data, int out_x_buffer_start,
                                              int out_x_buffer_end, int output_depth,
                                              float *acc_buffer)
{
  const float *filter_base_ptr = filter_data;
  for (int filter_x = 0; filter_x < filter_width; ++filter_x)
  {
    // Clamp the output segment to the pixels whose receptive field for this
    // filter tap falls inside the (padded) input row.
    const int out_x_loop_start =
      std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride);
    const int out_x_loop_end =
      std::min(out_x_buffer_end,
               (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride);

    float *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
    const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
    const float *input_ptr = input_data + in_x_origin * input_depth;
    // After consuming input_depth values, skip ahead by (stride - 1) pixels.
    const int input_ptr_increment = (stride - 1) * input_depth;
    for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++)
    {
      const float *filter_ptr = filter_base_ptr;
      for (int ic = 0; ic < input_depth; ++ic)
      {
        const float input_val = *input_ptr++;
        for (int m = 0; m < depth_multiplier; m++)
        {
          const float filter_val = *filter_ptr++;
          *acc_buffer_ptr++ += filter_val * input_val;
        }
      }
      input_ptr += input_ptr_increment;
    }
    filter_base_ptr += output_depth;
  }
}
// Initializes the accumulator buffer with bias values: the output_depth bias
// values are replicated once per output pixel.
inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
                                       const float *bias_data, float *acc_buffer)
{
  // TODO(benoitjacob): This might need optimized specializations
  // for small output_depth values, if that ever becomes an important
  // case (like it was for some quantized DepthwiseConv cases).
  for (int i = 0; i < num_output_pixels; i++)
  {
    memcpy(acc_buffer + i * output_depth, bias_data, sizeof(acc_buffer[0]) * output_depth);
  }
}
1028 // DepthwiseConv can run with multi threads on the dim specified by thread_dim.
1029 // Each thread processes output elements on dim, thread_dim, in the range of
1030 // [thread_start, thread_end).
1031 // For example, assume thread_start = 2, thread_end = 6, and thread_dim = 1, it
1032 // means that it will calculate DepthwiseConv for output_data[:, 2:5, :, :].
1033 inline void DepthwiseConvImpl(const DepthwiseConvParams ¶ms, const Shape &input_shape,
1034 const float *input_data, const Shape &filter_shape,
1035 const float *filter_data, const Shape &bias_shape,
1036 const float *bias_data, const Shape &output_shape, float *output_data,
1037 int thread_start, int thread_end, int thread_dim)
1039 UNUSED_RELEASE(bias_shape);
1040 const int stride_width = params.stride_width;
1041 const int stride_height = params.stride_height;
1042 const int pad_width = params.padding_values.width;
1043 const int pad_height = params.padding_values.height;
1044 const int depth_multiplier = params.depth_multiplier;
1045 const float output_activation_min = params.float_activation_min;
1046 const float output_activation_max = params.float_activation_max;
1047 const int dilation_width_factor = params.dilation_width_factor;
1048 const int dilation_height_factor = params.dilation_height_factor;
1049 assert(input_shape.DimensionsCount() == 4);
1050 assert(filter_shape.DimensionsCount() == 4);
1051 assert(output_shape.DimensionsCount() == 4);
1052 assert(thread_dim == 0 || thread_dim == 1);
1054 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
1055 const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
1056 const int input_height = input_shape.Dims(1);
1057 const int input_width = input_shape.Dims(2);
1058 const int input_depth = input_shape.Dims(3);
1059 const int filter_height = filter_shape.Dims(1);
1060 const int filter_width = filter_shape.Dims(2);
1061 const int output_height = output_shape.Dims(1);
1062 const int output_width = output_shape.Dims(2);
1063 assert(output_depth == input_depth * depth_multiplier);
1064 assert(bias_shape.FlatSize() == output_depth);
1066 static const int kAccBufferMaxSize = 4832;
1067 float acc_buffer[kAccBufferMaxSize];
1068 assert(kAccBufferMaxSize >= output_depth);
1069 const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
1070 const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
1071 assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize);
1072 assert(kAccBufferActualSize <= kAccBufferMaxSize);
1073 assert(kOutputPixelsInAccBuffer >= 1);
1075 UNUSED_RELEASE(kAccBufferActualSize);
1077 // row_accum_func will point to the core accumulation function to be used
1078 // for this DepthwiseConv op.
1079 using row_accum_func_t = decltype(&FloatDepthwiseConvAccumRowGeneric);
1080 row_accum_func_t row_accum_func = nullptr;
1082 #define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \
1083 if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) && \
1084 (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) && \
1085 depth_multiplier == FIXED_DEPTH_MULTIPLIER) \
1088 FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \
1092 // We go over our list of kernels by decreasing order of preference
1093 // for the cases where multiple kernels could apply.
1095 // Start with the fastest kernels: AllowStrided=false, fixed input depth.
1097 TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
1098 TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
1100 // Next come the strided kernels: AllowStrided=true, fixed input depth.
1101 // They are a bit less efficient, but allow stride!=1.
1103 TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
1104 TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
1105 TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
1106 TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
1107 TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
1108 TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 2)
1109 TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 4)
1110 TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)
1112 // Finally, the kernels allowing a variable input depth,
1113 // these are the least efficient but most general kernels.
1115 TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
1116 TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
1117 TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 8)
1118 TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 16)
1122 #undef TFMINI_USE_DEPTHWISECONV_KERNEL
1124 // No matching fast kernel found, use slow fallback.
1125 if (!row_accum_func)
1127 row_accum_func = FloatDepthwiseConvAccumRowGeneric;
1130 const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
1131 const int input_batch_stride = input_height_stride * input_shape.Dims(1);
1132 const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);
1134 // Now that we have determined row_accum_func, we can start work.
1135 int batch_start = 0;
1136 int batch_end = batches;
1138 int row_end = output_height;
1139 int output_ptr_offset = 0;
1144 // Multithread along with the batch axis
1145 assert(thread_start >= 0);
1146 assert(thread_end <= batches);
1147 batch_start = thread_start;
1148 batch_end = thread_end;
1149 output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0);
1152 // Multithread along with the row axis
1153 assert(thread_start >= 0);
1154 assert(thread_end <= output_height);
1155 row_start = thread_start;
1156 row_end = thread_end;
1157 output_ptr_offset = row_start * output_width * output_depth;
1161 float *output_ptr = output_data + output_ptr_offset;
1162 int batch_step = (output_height + row_start - row_end) * output_width * output_depth;
1164 for (int b = batch_start; b < batch_end; ++b)
1166 for (int out_y = row_start; out_y < row_end; ++out_y)
1168 const int in_y_origin = (out_y * stride_height) - pad_height;
1169 const int filter_y_start =
1170 std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor);
1171 const int filter_y_end =
1172 std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) /
1173 dilation_height_factor);
1174 for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
1175 out_x_buffer_start += kOutputPixelsInAccBuffer)
1177 const int out_x_buffer_end =
1178 std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
1179 // We call a 'pixel' a group of activation that share all but the
1180 // 'depth'/'channel' coordinate. num_output_pixels is the number of
1181 // output pixels that we will accumulate in this loop iteration.
1182 const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
1183 // Initialize our local accumulator with the bias values, so we don't
1184 // have to add them later.
1185 DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer);
1186 // Accumulation loop. Most of the time should be spent in here.
1187 for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
1189 const int in_y = in_y_origin + dilation_height_factor * filter_y;
1190 row_accum_func(stride_width, dilation_width_factor, input_depth, input_width,
1191 input_data + in_y * input_height_stride + b * input_batch_stride,
1192 pad_width, depth_multiplier, filter_width,
1193 filter_data + filter_y * filter_height_stride, out_x_buffer_start,
1194 out_x_buffer_end, output_depth, acc_buffer);
1196 // Finished accumulating. Now store to destination.
1197 const int num_output_values = output_depth * num_output_pixels;
1199 // TODO(benoitjacob) optimized code goes here
1201 // Handle 16 values at a time
1202 for (; i <= num_output_values - 16; i += 16)
1205 for (int k = 0; k < 4; k++)
1207 acc[k] = vld1q_f32(acc_buffer + i + 4 * k);
1209 for (int k = 0; k < 4; k++)
1211 acc[k] = vmaxq_f32(vdupq_n_f32(output_activation_min),
1212 vminq_f32(vdupq_n_f32(output_activation_max), acc[k]));
1214 for (int k = 0; k < 4; k++)
1216 vst1q_f32(output_ptr + 4 * k, acc[k]);
1220 // Handle 4 values at a time
1221 for (; i <= num_output_values - 4; i += 4)
1223 float32x4_t acc = vld1q_f32(acc_buffer + i);
1225 acc = vmaxq_f32(vdupq_n_f32(output_activation_min),
1226 vminq_f32(vdupq_n_f32(output_activation_max), acc));
1228 vst1q_f32(output_ptr, acc);
1232 // Handle leftover values, one by one. This is very slow.
1233 for (; i < num_output_values; i++)
1235 float acc = acc_buffer[i];
1236 acc = std::max(output_activation_min, std::min(output_activation_max, acc));
1238 *output_ptr++ = acc;
1242 output_ptr += batch_step;