compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h

   1 /*
   2  * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
   3  * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
   4  *
   5  * Licensed under the Apache License, Version 2.0 (the "License");
   6  * you may not use this file except in compliance with the License.
   7  * You may obtain a copy of the License at
   8  *
   9  *      http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 #ifndef __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_FLOAT_H__
  19 #define __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_FLOAT_H__
  20
  21 #include "cker/Shape.h"
  22 #include "cker/Types.h"
  23 #include "cker/Utils.h"
  24 #include "cker/neon/neon_check.h"
  25
  26 namespace nnfw
  27 {
  28 namespace cker
  29 {
  30 namespace optimized
  31 {
  32
  33 // Implementation of float DepthwiseConv
  34
  35 template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
  36 struct FloatDepthwiseConvKernel
  37 {
  38 };
  39
  40 #ifdef USE_NEON
  41
  42 template <> struct FloatDepthwiseConvKernel<false, 8, 1>
  43 {
  44   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
  45                   const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
  46                   float *acc_buffer_ptr)
  47   {
  48     (void)input_depth;
  49     (void)depth_multiplier;
  50     (void)input_ptr_increment;
  51     // Load the filters
  52     float32x4_t filter[2];
  53     for (int i = 0; i < 2; i++)
  54     {
  55       filter[i] = vld1q_f32(filter_ptr + 4 * i);
  56     }
  57     int outp = 0;
  58     // Handle 2 output pixels at a time.
  59     for (; outp <= num_output_pixels - 2; outp += 2)
  60     {
  61       // Load the inputs
  62       float32x4_t input[4];
  63       for (int i = 0; i < 4; i++)
  64       {
  65         input[i] = vld1q_f32(input_ptr + 4 * i);
  66       }
  67       input_ptr += 16;
  68       // Load the accumulators from acc_buffer
  69       float32x4_t acc[4];
  70       for (int i = 0; i < 4; i++)
  71       {
  72         acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
  73       }
  74       // Multiply-accumulate
  75       acc[0] = vmlaq_f32(acc[0], input[0], filter[0]);
  76       acc[1] = vmlaq_f32(acc[1], input[1], filter[1]);
  77       acc[2] = vmlaq_f32(acc[2], input[2], filter[0]);
  78       acc[3] = vmlaq_f32(acc[3], input[3], filter[1]);
  79       // Store the accumulators back to acc_buffer
  80       for (int i = 0; i < 4; i++)
  81       {
  82         vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
  83       }
  84       acc_buffer_ptr += 16;
  85     }
  86     // Handle one output pixel at a time.
  87     for (; outp < num_output_pixels; outp++)
  88     {
  89       // Load the inputs
  90       float32x4_t input[2];
  91       for (int i = 0; i < 2; i++)
  92       {
  93         input[i] = vld1q_f32(input_ptr + 4 * i);
  94       }
  95       input_ptr += 8;
  96       // Load the accumulators from acc_buffer
  97       float32x4_t acc[2];
  98       for (int i = 0; i < 2; i++)
  99       {
 100         acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
 101       }
 102       // Multiply-accumulate
 103       for (int i = 0; i < 2; i++)
 104       {
 105         acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
 106       }
 107       // Store the accumulators back to acc_buffer
 108       for (int i = 0; i < 2; i++)
 109       {
 110         vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
 111       }
 112       acc_buffer_ptr += 8;
 113     }
 114   }
 115 };
 116
 117 template <> struct FloatDepthwiseConvKernel<false, 2, 1>
 118 {
 119   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 120                   const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
 121                   float *acc_buffer_ptr)
 122   {
 123     (void)input_depth;
 124     (void)depth_multiplier;
 125     (void)input_ptr_increment;
 126
 127     const float32x2_t filters = vld1_f32(filter_ptr);
 128     const float32x4_t filters_dup2 = vcombine_f32(filters, filters);
 129     int outp = 0;
 130     // Handle 8 output pixels at a time.
 131     for (; outp <= num_output_pixels - 8; outp += 8)
 132     {
 133       // Load the inputs
 134       float32x4_t input[4];
 135       for (int i = 0; i < 4; i++)
 136       {
 137         input[i] = vld1q_f32(input_ptr + 4 * i);
 138       }
 139       input_ptr += 16;
 140       // Load the accumulators from acc_buffer
 141       float32x4_t acc[4];
 142       for (int i = 0; i < 4; i++)
 143       {
 144         acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
 145       }
 146       // Multiply-accumulate
 147       for (int i = 0; i < 4; i++)
 148       {
 149         acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
 150       }
 151       // Store the accumulators back to acc_buffer
 152       for (int i = 0; i < 4; i++)
 153       {
 154         vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
 155       }
 156       acc_buffer_ptr += 16;
 157     }
 158     // Handle 4 output pixels at a time.
 159     for (; outp <= num_output_pixels - 4; outp += 4)
 160     {
 161       // Load the inputs
 162       float32x4_t input[2];
 163       for (int i = 0; i < 2; i++)
 164       {
 165         input[i] = vld1q_f32(input_ptr + 4 * i);
 166       }
 167       input_ptr += 8;
 168       // Load the accumulators from acc_buffer
 169       float32x4_t acc[2];
 170       for (int i = 0; i < 2; i++)
 171       {
 172         acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
 173       }
 174       // Multiply-accumulate
 175       for (int i = 0; i < 2; i++)
 176       {
 177         acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
 178       }
 179       // Store the accumulators back to acc_buffer
 180       for (int i = 0; i < 2; i++)
 181       {
 182         vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
 183       }
 184       acc_buffer_ptr += 8;
 185     }
 186     // Handle 2 output pixels at a time.
 187     for (; outp <= num_output_pixels - 2; outp += 2)
 188     {
 189       // Load the inputs
 190       const float32x4_t input = vld1q_f32(input_ptr);
 191       input_ptr += 4;
 192       // Load the accumulators from acc_buffer
 193       float32x4_t acc = vld1q_f32(acc_buffer_ptr);
 194       // Multiply-accumulate
 195       acc = vmlaq_f32(acc, input, filters_dup2);
 196       // Store the accumulators back to acc_buffer
 197       vst1q_f32(acc_buffer_ptr, acc);
 198       acc_buffer_ptr += 4;
 199     }
 200     // Handle 1 output pixel at a time
 201     for (; outp < num_output_pixels; outp++)
 202     {
 203       // Load the inputs
 204       const float32x2_t input = vld1_f32(input_ptr);
 205       input_ptr += 2;
 206       // Load the accumulators from acc_buffer
 207       float32x2_t acc = vld1_f32(acc_buffer_ptr);
 208       // Multiply-accumulate
 209       acc = vmla_f32(acc, input, filters);
 210       // Store the accumulators back to acc_buffer
 211       vst1_f32(acc_buffer_ptr, acc);
 212       acc_buffer_ptr += 2;
 213     }
 214   }
 215 };
 216
 217 template <> struct FloatDepthwiseConvKernel<true, 0, 1>
 218 {
 219   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 220                   const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
 221                   float *acc_buffer_ptr)
 222   {
 223     (void)depth_multiplier;
 224
 225     // Handle one output pixel at a time.
 226     for (int outp = 0; outp < num_output_pixels; outp++)
 227     {
 228       const float *local_filter_ptr = filter_ptr;
 229       const float *local_input_ptr = input_ptr;
 230       int ic = 0;
 231       // Handle 16 input channels at a time.
 232       for (; ic <= input_depth - 16; ic += 16)
 233       {
 234         // Load the filters
 235         float32x4_t filter_0 = vld1q_f32(local_filter_ptr + 4 * 0);
 236         float32x4_t filter_1 = vld1q_f32(local_filter_ptr + 4 * 1);
 237         float32x4_t filter_2 = vld1q_f32(local_filter_ptr + 4 * 2);
 238         float32x4_t filter_3 = vld1q_f32(local_filter_ptr + 4 * 3);
 239         local_filter_ptr += 16;
 240         // Load the inputs
 241         float32x4_t input_0 = vld1q_f32(local_input_ptr + 4 * 0);
 242         float32x4_t input_1 = vld1q_f32(local_input_ptr + 4 * 1);
 243         float32x4_t input_2 = vld1q_f32(local_input_ptr + 4 * 2);
 244         float32x4_t input_3 = vld1q_f32(local_input_ptr + 4 * 3);
 245         local_input_ptr += 16;
 246         // Load the accumulators from acc_buffer
 247         float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
 248         float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
 249         float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
 250         float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
 251         // Multiply-accumulate
 252         acc_0 = vmlaq_f32(acc_0, input_0, filter_0);
 253         acc_1 = vmlaq_f32(acc_1, input_1, filter_1);
 254         acc_2 = vmlaq_f32(acc_2, input_2, filter_2);
 255         acc_3 = vmlaq_f32(acc_3, input_3, filter_3);
 256         // Store the accumulators back to acc_buffer
 257         vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
 258         vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
 259         vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
 260         vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
 261         acc_buffer_ptr += 16;
 262       }
 263       // Handle 4 input channels at a time.
 264       for (; ic <= input_depth - 4; ic += 4)
 265       {
 266         // Load the filters
 267         float32x4_t filter;
 268         filter = vld1q_f32(local_filter_ptr);
 269         local_filter_ptr += 4;
 270         // Load the inputs
 271         float32x4_t input;
 272         input = vld1q_f32(local_input_ptr);
 273         local_input_ptr += 4;
 274         // Load the accumulators from acc_buffer
 275         float32x4_t acc;
 276         acc = vld1q_f32(acc_buffer_ptr);
 277         // Multiply-accumulate
 278         acc = vmlaq_f32(acc, input, filter);
 279         // Store the accumulators back to acc_buffer
 280         vst1q_f32(acc_buffer_ptr, acc);
 281         acc_buffer_ptr += 4;
 282       }
 283       // Handle one input channel at a time.
 284       for (; ic < input_depth; ic++)
 285       {
 286         const float input_val = *local_input_ptr++;
 287         const float filter_val = *local_filter_ptr++;
 288         *acc_buffer_ptr++ += filter_val * input_val;
 289       }
 290       input_ptr += input_ptr_increment;
 291     }
 292   }
 293 };
 294
 295 template <> struct FloatDepthwiseConvKernel<true, 0, 8>
 296 {
 297   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 298                   const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
 299                   float *acc_buffer_ptr)
 300   {
 301     (void)depth_multiplier;
 302
 303     // Handle one output pixel at a time.
 304     for (int outp = 0; outp < num_output_pixels; outp++)
 305     {
 306       const float *local_filter_ptr = filter_ptr;
 307       const float *local_input_ptr = input_ptr;
 308       int ic = 0;
 309       // Handle 2 input channels at a time.
 310       for (; ic <= input_depth - 2; ic += 2)
 311       {
 312         // Load the filters
 313         float32x4_t filter[4];
 314         for (int i = 0; i < 4; i++)
 315         {
 316           filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
 317         }
 318         local_filter_ptr += 16;
 319         // Load the inputs
 320         const float32x2_t input = vld1_f32(local_input_ptr);
 321         local_input_ptr += 2;
 322         // Load the accumulators from acc_buffer
 323         float32x4_t acc[4];
 324         for (int i = 0; i < 4; i++)
 325         {
 326           acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
 327         }
 328         // Multiply-accumulate
 329         acc[0] = vmlaq_lane_f32(acc[0], filter[0], input, 0);
 330         acc[1] = vmlaq_lane_f32(acc[1], filter[1], input, 0);
 331         acc[2] = vmlaq_lane_f32(acc[2], filter[2], input, 1);
 332         acc[3] = vmlaq_lane_f32(acc[3], filter[3], input, 1);
 333         // Store the accumulators back to acc_buffer
 334         for (int i = 0; i < 4; i++)
 335         {
 336           vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
 337         }
 338         acc_buffer_ptr += 16;
 339       }
 340       // Handle one input channel at a time.
 341       for (; ic < input_depth; ic++)
 342       {
 343         // Load the filters
 344         float32x4_t filter[2];
 345         for (int i = 0; i < 2; i++)
 346         {
 347           filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
 348         }
 349         local_filter_ptr += 8;
 350         // Load the inputs
 351         const float input_val = *local_input_ptr++;
 352         // Load the accumulators from acc_buffer
 353         float32x4_t acc[2];
 354         for (int i = 0; i < 2; i++)
 355         {
 356           acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
 357         }
 358         // Multiply-accumulate
 359         for (int i = 0; i < 2; i++)
 360         {
 361           acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
 362         }
 363         // Store the accumulators back to acc_buffer
 364         for (int i = 0; i < 2; i++)
 365         {
 366           vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
 367         }
 368         acc_buffer_ptr += 8;
 369       }
 370       input_ptr += input_ptr_increment;
 371     }
 372   }
 373 };
 374
 375 // Note this implementation is very slow for input_depths < 8
 376 // (e.g. comparable to reference implementation) see, specializations for
 377 // input_depth=3 below.
 378 template <> struct FloatDepthwiseConvKernel<true, 0, 2>
 379 {
 380   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 381                   const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
 382                   float *acc_buffer_ptr)
 383   {
 384     (void)depth_multiplier;
 385
 386     // Handle one output pixel at a time.
 387     for (int outp = 0; outp < num_output_pixels; outp++)
 388     {
 389       const float *local_filter_ptr = filter_ptr;
 390       const float *local_input_ptr = input_ptr;
 391       int ic = 0;
 392       // Handle 8 input channels at a time.
 393       for (; ic <= input_depth - 8; ic += 8)
 394       {
 395         // Load the filters
 396         float32x4_t filter[4];
 397         for (int i = 0; i < 4; i++)
 398         {
 399           filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
 400         }
 401         local_filter_ptr += 16;
 402         // Load the inputs
 403         float32x4x2_t input_dup2[2];
 404         for (int i = 0; i < 2; i++)
 405         {
 406           const float32x4_t input = vld1q_f32(local_input_ptr + 4 * i);
 407           input_dup2[i] = vzipq_f32(input, input);
 408         }
 409         local_input_ptr += 8;
 410         // Load the accumulators from acc_buffer
 411         float32x4_t acc[4];
 412         for (int i = 0; i < 4; i++)
 413         {
 414           acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
 415         }
 416         // Multiply-accumulate
 417         acc[0] = vmlaq_f32(acc[0], filter[0], input_dup2[0].val[0]);
 418         acc[1] = vmlaq_f32(acc[1], filter[1], input_dup2[0].val[1]);
 419         acc[2] = vmlaq_f32(acc[2], filter[2], input_dup2[1].val[0]);
 420         acc[3] = vmlaq_f32(acc[3], filter[3], input_dup2[1].val[1]);
 421         // Store the accumulators back to acc_buffer
 422         for (int i = 0; i < 4; i++)
 423         {
 424           vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
 425         }
 426         acc_buffer_ptr += 16;
 427       }
 428       // Handle 4 input channels at a time.
 429       for (; ic <= input_depth - 4; ic += 4)
 430       {
 431         // Load the filters
 432         float32x2_t filter[4];
 433         for (int i = 0; i < 4; i++)
 434         {
 435           filter[i] = vld1_f32(local_filter_ptr + 2 * i);
 436         }
 437         local_filter_ptr += 8;
 438         // Load the inputs
 439         const float32x4_t input = vld1q_f32(local_input_ptr);
 440         local_input_ptr += 4;
 441         // Load the accumulators from acc_buffer
 442         float32x2_t acc[4];
 443         for (int i = 0; i < 4; i++)
 444         {
 445           acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
 446         }
 447         // Multiply-accumulate
 448         acc[0] = vmla_lane_f32(acc[0], filter[0], vget_low_f32(input), 0);
 449         acc[1] = vmla_lane_f32(acc[1], filter[1], vget_low_f32(input), 1);
 450         acc[2] = vmla_lane_f32(acc[2], filter[2], vget_high_f32(input), 0);
 451         acc[3] = vmla_lane_f32(acc[3], filter[3], vget_high_f32(input), 1);
 452         // Store the accumulators back to acc_buffer
 453         for (int i = 0; i < 4; i++)
 454         {
 455           vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
 456         }
 457         acc_buffer_ptr += 8;
 458       }
 459       // Handle 2 input channels at a time.
 460       for (; ic <= input_depth - 2; ic += 2)
 461       {
 462         // Load the filters
 463         const float32x4_t filter = vld1q_f32(local_filter_ptr);
 464         local_filter_ptr += 4;
 465         // Load the inputs
 466         const float32x2_t input = vld1_f32(local_input_ptr);
 467         local_input_ptr += 2;
 468         // Load the accumulators from acc_buffer
 469         float32x2_t acc[2];
 470         for (int i = 0; i < 2; i++)
 471         {
 472           acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
 473         }
 474         // Multiply-accumulate
 475         acc[0] = vmla_lane_f32(acc[0], vget_low_f32(filter), input, 0);
 476         acc[1] = vmla_lane_f32(acc[1], vget_high_f32(filter), input, 1);
 477         // Store the accumulators back to acc_buffer
 478         for (int i = 0; i < 2; i++)
 479         {
 480           vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
 481         }
 482         acc_buffer_ptr += 4;
 483       }
 484       // Handle one input channel at a time.
 485       for (; ic < input_depth; ic++)
 486       {
 487         // Load the inputs
 488         const float input_val = *local_input_ptr++;
 489         // Multiply-accumulate
 490         for (int i = 0; i < 2; i++)
 491         {
 492           acc_buffer_ptr[i] += local_filter_ptr[i] * input_val;
 493         }
 494         local_filter_ptr += 2;
 495         acc_buffer_ptr += 2;
 496       }
 497       input_ptr += input_ptr_increment;
 498     }
 499   }
 500 };
 501
 502 template <> struct FloatDepthwiseConvKernel<true, 3, 2>
 503 {
 504   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 505                   const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
 506                   float *acc_buffer_ptr)
 507   {
 508     (void)input_depth;
 509     (void)depth_multiplier;
 510
 511     // Load the filters
 512     float32x2_t filter[3];
 513     for (int i = 0; i < 3; i++)
 514     {
 515       filter[i] = vld1_f32(filter_ptr + 2 * i);
 516     }
 517     // Handle one output pixel at a time.
 518     for (int outp = 0; outp < num_output_pixels; outp++)
 519     {
 520       const float32x2_t input01 = vld1_f32(input_ptr);
 521       const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
 522       // Load the accumulators from acc_buffer
 523       float32x2_t acc[3];
 524       for (int i = 0; i < 3; i++)
 525       {
 526         acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
 527       }
 528       // Multiply-accumulate for each input channel there 2 outputs
 529       acc[0] = vmla_lane_f32(acc[0], filter[0], input01, 0);
 530       acc[1] = vmla_lane_f32(acc[1], filter[1], input01, 1);
 531       acc[2] = vmla_lane_f32(acc[2], filter[2], input2, 0);
 532       // Store the accumulators back to acc_buffer
 533       for (int i = 0; i < 3; i++)
 534       {
 535         vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
 536       }
 537       acc_buffer_ptr += 6;
 538       input_ptr += input_ptr_increment;
 539     }
 540   }
 541 };
 542
 543 template <> struct FloatDepthwiseConvKernel<true, 3, 4>
 544 {
 545   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 546                   const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
 547                   float *acc_buffer_ptr)
 548   {
 549     (void)input_depth;
 550     (void)depth_multiplier;
 551
 552     // Load the filters
 553     float32x4_t filter[3];
 554     for (int i = 0; i < 3; i++)
 555     {
 556       filter[i] = vld1q_f32(filter_ptr + 4 * i);
 557     }
 558     // Handle one output pixel at a time.
 559     for (int outp = 0; outp < num_output_pixels; outp++)
 560     {
 561       // NOTE: we only want 3 values, so we read it as two ops where
 562       // the second op just duplicates the lane
 563       const float32x2_t input01 = vld1_f32(input_ptr);
 564       const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
 565       // Load the accumulators from acc_buffer
 566       float32x4_t acc[3];
 567       for (int i = 0; i < 3; i++)
 568       {
 569         acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
 570       }
 571       // Multiply-accumulate all outputs.
 572       acc[0] = vmlaq_lane_f32(acc[0], filter[0], input01, 0);
 573       acc[1] = vmlaq_lane_f32(acc[1], filter[1], input01, 1);
 574       acc[2] = vmlaq_lane_f32(acc[2], filter[2], input2, 0);
 575       // Store the accumulators back to acc_buffer
 576       for (int i = 0; i < 3; i++)
 577       {
 578         vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
 579       }
 580       acc_buffer_ptr += 12;
 581       input_ptr += input_ptr_increment;
 582     }
 583   }
 584 };
 585
 586 template <> struct FloatDepthwiseConvKernel<true, 1, 8>
 587 {
 588   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 589                   const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
 590                   float *acc_buffer_ptr)
 591   {
 592     (void)input_depth;
 593     (void)depth_multiplier;
 594
 595     // Load the filters
 596     float32x4_t filter[2];
 597     for (int i = 0; i < 2; i++)
 598     {
 599       filter[i] = vld1q_f32(filter_ptr + 4 * i);
 600     }
 601     // Handle one output pixel at a time.
 602     for (int outp = 0; outp < num_output_pixels; outp++)
 603     {
 604       // Load the inputs
 605       const float input_val = *input_ptr;
 606       input_ptr += input_ptr_increment;
 607       // Load the accumulators from acc_buffer
 608       float32x4_t acc[2];
 609       for (int i = 0; i < 2; i++)
 610       {
 611         acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
 612       }
 613       // Multiply-accumulate
 614       for (int i = 0; i < 2; i++)
 615       {
 616         acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
 617       }
 618       // Store the accumulators back to acc_buffer
 619       for (int i = 0; i < 2; i++)
 620       {
 621         vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
 622       }
 623       acc_buffer_ptr += 8;
 624     }
 625   }
 626 };
 627
 628 template <> struct FloatDepthwiseConvKernel<true, 1, 32>
 629 {
 630   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 631                   const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
 632                   float *acc_buffer_ptr)
 633   {
 634     (void)input_depth;
 635     (void)depth_multiplier;
 636
 637     // Load the filters
 638     float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0);
 639     float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1);
 640     float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2);
 641     float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3);
 642     float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4);
 643     float32x4_t filter_5 = vld1q_f32(filter_ptr + 4 * 5);
 644     float32x4_t filter_6 = vld1q_f32(filter_ptr + 4 * 6);
 645     float32x4_t filter_7 = vld1q_f32(filter_ptr + 4 * 7);
 646
 647     // Handle one output pixel at a time.
 648     for (int outp = 0; outp < num_output_pixels; outp++)
 649     {
 650       // Load the inputs
 651       const float input_val = *input_ptr;
 652       input_ptr += input_ptr_increment;
 653       // Load the accumulators from acc_buffer
 654       float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
 655       float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
 656       float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
 657       float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
 658       float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4);
 659       float32x4_t acc_5 = vld1q_f32(acc_buffer_ptr + 4 * 5);
 660       float32x4_t acc_6 = vld1q_f32(acc_buffer_ptr + 4 * 6);
 661       float32x4_t acc_7 = vld1q_f32(acc_buffer_ptr + 4 * 7);
 662       // Multiply-accumulate
 663       acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val);
 664       acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val);
 665       acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val);
 666       acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val);
 667       acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val);
 668       acc_5 = vmlaq_n_f32(acc_5, filter_5, input_val);
 669       acc_6 = vmlaq_n_f32(acc_6, filter_6, input_val);
 670       acc_7 = vmlaq_n_f32(acc_7, filter_7, input_val);
 671       // Store the accumulators back to acc_buffer
 672       vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
 673       vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
 674       vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
 675       vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
 676       vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4);
 677       vst1q_f32(acc_buffer_ptr + 4 * 5, acc_5);
 678       vst1q_f32(acc_buffer_ptr + 4 * 6, acc_6);
 679       vst1q_f32(acc_buffer_ptr + 4 * 7, acc_7);
 680       acc_buffer_ptr += 32;
 681     }
 682   }
 683 };
 684
 685 template <> struct FloatDepthwiseConvKernel<true, 1, 20>
 686 {
 687   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 688                   const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
 689                   float *acc_buffer_ptr)
 690   {
 691     (void)input_depth;
 692     (void)depth_multiplier;
 693
 694     // Load the filters
 695     float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0);
 696     float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1);
 697     float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2);
 698     float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3);
 699     float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4);
 700
 701     // Handle one output pixel at a time.
 702     for (int outp = 0; outp < num_output_pixels; outp++)
 703     {
 704       // Load the inputs
 705       const float input_val = *input_ptr;
 706       input_ptr += input_ptr_increment;
 707       // Load the accumulators from acc_buffer
 708       float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
 709       float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
 710       float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
 711       float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
 712       float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4);
 713       // Multiply-accumulate
 714       acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val);
 715       acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val);
 716       acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val);
 717       acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val);
 718       acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val);
 719       // Store the accumulators back to acc_buffer
 720       vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
 721       vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
 722       vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
 723       vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
 724       vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4);
 725       acc_buffer_ptr += 20;
 726     }
 727   }
 728 };
 729
 730 template <> struct FloatDepthwiseConvKernel<true, 0, 16>
 731 {
 732   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 733                   const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
 734                   float *acc_buffer_ptr)
 735   {
 736     (void)depth_multiplier;
 737
 738     // Handle one output pixel at a time.
 739     for (int outp = 0; outp < num_output_pixels; outp++)
 740     {
 741       const float *local_filter_ptr = filter_ptr;
 742       const float *local_input_ptr = input_ptr;
 743       for (int ic = 0; ic < input_depth; ic++)
 744       {
 745         // Load the filters
 746         float32x4_t filter[4];
 747         for (int i = 0; i < 4; i++)
 748         {
 749           filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
 750         }
 751         local_filter_ptr += 16;
 752         // Load the inputs
 753         const float input_val = *local_input_ptr++;
 754         // Load the accumulators from acc_buffer
 755         float32x4_t acc[4];
 756         for (int i = 0; i < 4; i++)
 757         {
 758           acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
 759         }
 760         // Multiply-accumulate
 761         for (int i = 0; i < 4; i++)
 762         {
 763           acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
 764         }
 765         // Store the accumulators back to acc_buffer
 766         for (int i = 0; i < 4; i++)
 767         {
 768           vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
 769         }
 770         acc_buffer_ptr += 16;
 771       }
 772       input_ptr += input_ptr_increment;
 773     }
 774   }
 775 };
 776
 777 template <> struct FloatDepthwiseConvKernel<true, 8, 1>
 778 {
 779   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 780                   const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
 781                   float *acc_buffer_ptr)
 782   {
 783     (void)input_depth;
 784     (void)depth_multiplier;
 785
 786     // Load the filters
 787     float32x4_t filter[2];
 788     for (int i = 0; i < 2; i++)
 789     {
 790       filter[i] = vld1q_f32(filter_ptr + 4 * i);
 791     }
 792     // Handle one output pixel at a time.
 793     for (int outp = 0; outp < num_output_pixels; outp++)
 794     {
 795       // Load the inputs
 796       float32x4_t input[2];
 797       for (int i = 0; i < 2; i++)
 798       {
 799         input[i] = vld1q_f32(input_ptr + 4 * i);
 800       }
 801       // Load the accumulators from acc_buffer
 802       float32x4_t acc[2];
 803       for (int i = 0; i < 2; i++)
 804       {
 805         acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
 806       }
 807       // Multiply-accumulate
 808       for (int i = 0; i < 2; i++)
 809       {
 810         acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
 811       }
 812       // Store the accumulators back to acc_buffer
 813       for (int i = 0; i < 2; i++)
 814       {
 815         vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
 816       }
 817       acc_buffer_ptr += 8;
 818       input_ptr += input_ptr_increment;
 819     }
 820   }
 821 };
 822
 823 template <> struct FloatDepthwiseConvKernel<true, 2, 1>
 824 {
 825   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 826                   const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
 827                   float *acc_buffer_ptr)
 828   {
 829     (void)input_depth;
 830     (void)depth_multiplier;
 831
 832     float32x2_t filter = vld1_f32(filter_ptr);
 833     float32x4_t filter_x4 = vcombine_f32(filter, filter);
 834     int outp = 0;
 835
 836     // Handle two output pixels at a time.
 837     for (; outp <= num_output_pixels - 2; outp += 2)
 838     {
 839       // Load the inputs
 840       float32x2_t input_1 = vld1_f32(input_ptr);
 841       input_ptr += input_ptr_increment;
 842       float32x2_t input_2 = vld1_f32(input_ptr);
 843       input_ptr += input_ptr_increment;
 844       float32x4_t input = vcombine_f32(input_1, input_2);
 845
 846       // Load the accumulators from acc_buffer
 847       float32x4_t acc = vld1q_f32(acc_buffer_ptr);
 848
 849       // Multiply-accumulate
 850       acc = vmlaq_f32(acc, input, filter_x4);
 851
 852       // Store the accumulators back to acc_buffer
 853       vst1q_f32(acc_buffer_ptr, acc);
 854       acc_buffer_ptr += 4;
 855     }
 856     // Handle one output pixel at a time.
 857     for (; outp < num_output_pixels; outp++)
 858     {
 859       // Load the inputs
 860       float32x2_t input = vld1_f32(input_ptr);
 861       input_ptr += input_ptr_increment;
 862
 863       // Load the accumulators from acc_buffer
 864       float32x2_t acc = vld1_f32(acc_buffer_ptr);
 865
 866       // Multiply-accumulate
 867       acc = vmla_f32(acc, input, filter);
 868
 869       // Store the accumulators back to acc_buffer
 870       vst1_f32(acc_buffer_ptr, acc);
 871       acc_buffer_ptr += 2;
 872     }
 873   }
 874 };
 875
 876 template <> struct FloatDepthwiseConvKernel<true, 4, 1>
 877 {
 878   static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
 879                   const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
 880                   float *acc_buffer_ptr)
 881   {
 882     (void)input_depth;
 883     (void)depth_multiplier;
 884
 885     float32x4_t filter = vld1q_f32(filter_ptr);
 886
 887     // Handle one output pixel at a time.
 888     for (int outp = 0; outp < num_output_pixels; outp++)
 889     {
 890       // Load the inputs
 891       float32x4_t input = vld1q_f32(input_ptr);
 892       // Load the accumulators from acc_buffer
 893       float32x4_t acc = vld1q_f32(acc_buffer_ptr);
 894       // Multiply-accumulate
 895       acc = vmlaq_f32(acc, input, filter);
 896       // Store the accumulators back to acc_buffer
 897       vst1q_f32(acc_buffer_ptr, acc);
 898       acc_buffer_ptr += 4;
 899       input_ptr += input_ptr_increment;
 900     }
 901   }
 902 };
 903 #endif
 904
 905 // Accumulates the effect of one row of the filter, on a segment of one row
 906 // of the output, accessing the corresponding one row of the input.
 907 template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
 908 void FloatDepthwiseConvAccumRow(int stride, int dilation_factor, int input_depth, int input_width,
 909                                 const float *input_data, int pad_width, int depth_multiplier,
 910                                 int filter_width, const float *filter_data, int out_x_buffer_start,
 911                                 int out_x_buffer_end, int output_depth, float *acc_buffer)
 912 {
 913   // Sanity check parameters. This is important in particular to ensure
 914   // that we keep the number of template instantiations minimal, so we don't
 915   // increase binary size unnecessarily.
 916   static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
 917   static_assert(kFixedInputDepth || kAllowStrided, "");
 918   assert(stride == 1 || kAllowStrided);
 919   if (kFixedInputDepth)
 920   {
 921     assert(input_depth == kFixedInputDepth);
 922   }
 923   if (kFixedDepthMultiplier)
 924   {
 925     assert(depth_multiplier == kFixedDepthMultiplier);
 926   }
 927   assert(output_depth == input_depth * depth_multiplier);
 928   const int input_ptr_increment = stride * input_depth;
 929   const float *filter_base_ptr = filter_data;
 930   for (int filter_x = 0; filter_x < filter_width; ++filter_x)
 931   {
 932     // For the current (filter_x, filter_y) point in the filter,
 933     // compute the boundaries of the corresponding output row segment.
 934     int out_x_loop_start_unclamped = 0;
 935     int out_x_loop_end_unclamped = 0;
 936     if (kAllowStrided)
 937     {
 938       if (stride == 2)
 939       {
 940         out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 1) / 2;
 941         out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 1) / 2;
 942       }
 943       else if (stride == 4)
 944       {
 945         out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 3) / 4;
 946         out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 3) / 4;
 947       }
 948       else
 949       {
 950         out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + stride - 1) / stride;
 951         out_x_loop_end_unclamped =
 952           (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride;
 953       }
 954     }
 955     else
 956     {
 957       out_x_loop_start_unclamped = pad_width - dilation_factor * filter_x;
 958       out_x_loop_end_unclamped = pad_width + input_width - dilation_factor * filter_x;
 959     }
 960     // The kernel will have to iterate on the segment of the
 961     // output row that starts at out_x_loop_start and out_x_loop_end.
 962     const int out_x_loop_start = std::max(out_x_buffer_start, out_x_loop_start_unclamped);
 963     const int out_x_loop_end = std::min(out_x_buffer_end, out_x_loop_end_unclamped);
 964
 965     float *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
 966     const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
 967     const float *input_ptr = input_data + in_x_origin * input_depth;
 968     const int num_output_pixels = out_x_loop_end - out_x_loop_start;
 969     FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run(
 970       num_output_pixels, input_depth, depth_multiplier, input_ptr, input_ptr_increment,
 971       filter_base_ptr, acc_buffer_ptr);
 972     filter_base_ptr += output_depth;
 973   }
 974 }
 975
 976 // generic fallback of FloatDepthwiseConvAccumRow, portable, non-templatized.
 977 inline void FloatDepthwiseConvAccumRowGeneric(int stride, int dilation_factor, int input_depth,
 978                                               int input_width, const float *input_data,
 979                                               int pad_width, int depth_multiplier, int filter_width,
 980                                               const float *filter_data, int out_x_buffer_start,
 981                                               int out_x_buffer_end, int output_depth,
 982                                               float *acc_buffer)
 983 {
 984   const float *filter_base_ptr = filter_data;
 985   for (int filter_x = 0; filter_x < filter_width; ++filter_x)
 986   {
 987     const int out_x_loop_start =
 988       std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride);
 989     const int out_x_loop_end =
 990       std::min(out_x_buffer_end,
 991                (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride);
 992
 993     float *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
 994     const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
 995     const float *input_ptr = input_data + in_x_origin * input_depth;
 996     const int input_ptr_increment = (stride - 1) * input_depth;
 997     for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++)
 998     {
 999       const float *filter_ptr = filter_base_ptr;
1000       for (int ic = 0; ic < input_depth; ++ic)
1001       {
1002         const float input_val = *input_ptr++;
1003         for (int m = 0; m < depth_multiplier; m++)
1004         {
1005           const float filter_val = *filter_ptr++;
1006           *acc_buffer_ptr++ += filter_val * input_val;
1007         }
1008       }
1009       input_ptr += input_ptr_increment;
1010     }
1011     filter_base_ptr += output_depth;
1012   }
1013 }
1014
// Seeds the accumulator buffer with the bias: the output_depth floats at
// bias_data are copied into each of the first num_output_pixels pixels of
// acc_buffer, which are laid out back to back.
inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
                                       const float *bias_data, float *acc_buffer)
{
  // TODO(benoitjacob): This might need optimized specializations
  // for small output_depth values, if that ever becomes an important
  // case (like it was for some quantized DepthwiseConv cases).
  const auto bytes_per_pixel = sizeof(acc_buffer[0]) * output_depth;
  float *pixel = acc_buffer;
  for (int remaining = num_output_pixels; remaining > 0; --remaining)
  {
    memcpy(pixel, bias_data, bytes_per_pixel);
    pixel += output_depth;
  }
}
1027
1028 // DepthwiseConv can run with multi threads on the dim specified by thread_dim.
1029 // Each thread processes output elements on dim, thread_dim, in the range of
1030 // [thread_start, thread_end).
1031 // For example, assume thread_start = 2, thread_end = 6, and thread_dim = 1, it
1032 // means that it will calculate DepthwiseConv for output_data[:, 2:5, :, :].
1033 inline void DepthwiseConvImpl(const DepthwiseConvParams &params, const Shape &input_shape,
1034                               const float *input_data, const Shape &filter_shape,
1035                               const float *filter_data, const Shape &bias_shape,
1036                               const float *bias_data, const Shape &output_shape, float *output_data,
1037                               int thread_start, int thread_end, int thread_dim)
1038 {
1039   UNUSED_RELEASE(bias_shape);
1040   const int stride_width = params.stride_width;
1041   const int stride_height = params.stride_height;
1042   const int pad_width = params.padding_values.width;
1043   const int pad_height = params.padding_values.height;
1044   const int depth_multiplier = params.depth_multiplier;
1045   const float output_activation_min = params.float_activation_min;
1046   const float output_activation_max = params.float_activation_max;
1047   const int dilation_width_factor = params.dilation_width_factor;
1048   const int dilation_height_factor = params.dilation_height_factor;
1049   assert(input_shape.DimensionsCount() == 4);
1050   assert(filter_shape.DimensionsCount() == 4);
1051   assert(output_shape.DimensionsCount() == 4);
1052   assert(thread_dim == 0 || thread_dim == 1);
1053
1054   const int batches = MatchingDim(input_shape, 0, output_shape, 0);
1055   const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
1056   const int input_height = input_shape.Dims(1);
1057   const int input_width = input_shape.Dims(2);
1058   const int input_depth = input_shape.Dims(3);
1059   const int filter_height = filter_shape.Dims(1);
1060   const int filter_width = filter_shape.Dims(2);
1061   const int output_height = output_shape.Dims(1);
1062   const int output_width = output_shape.Dims(2);
1063   assert(output_depth == input_depth * depth_multiplier);
1064   assert(bias_shape.FlatSize() == output_depth);
1065
1066   static const int kAccBufferMaxSize = 4832;
1067   float acc_buffer[kAccBufferMaxSize];
1068   assert(kAccBufferMaxSize >= output_depth);
1069   const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
1070   const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
1071   assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize);
1072   assert(kAccBufferActualSize <= kAccBufferMaxSize);
1073   assert(kOutputPixelsInAccBuffer >= 1);
1074
1075   UNUSED_RELEASE(kAccBufferActualSize);
1076
1077   // row_accum_func will point to the core accumulation function to be used
1078   // for this DepthwiseConv op.
1079   using row_accum_func_t = decltype(&FloatDepthwiseConvAccumRowGeneric);
1080   row_accum_func_t row_accum_func = nullptr;
1081
1082 #define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \
1083   if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&                                  \
1084       (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&                             \
1085       depth_multiplier == FIXED_DEPTH_MULTIPLIER)                                                 \
1086   {                                                                                               \
1087     row_accum_func =                                                                              \
1088       FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>;       \
1089   }
1090
1091 #ifdef USE_NEON
1092   // We go over our list of kernels by decreasing order of preference
1093   // for the cases where multiple kernels could apply.
1094
1095   // Start with the fastest kernels: AllowStrided=false, fixed input depth.
1096
1097   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
1098   TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
1099
1100   // Next come the strided kernels: AllowStrided=true, fixed input depth.
1101   // They are a bit less efficient, but allow stride!=1.
1102
1103   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
1104   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
1105   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
1106   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
1107   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
1108   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 2)
1109   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 4)
1110   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)
1111
1112   // Finally, the kernels allowing a variable input depth,
1113   // these are the least efficient but most general kernels.
1114
1115   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
1116   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
1117   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 8)
1118   TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 16)
1119
1120 #endif // USE_NEON
1121
1122 #undef TFMINI_USE_DEPTHWISECONV_KERNEL
1123
1124   // No matching fast kernel found, use slow fallback.
1125   if (!row_accum_func)
1126   {
1127     row_accum_func = FloatDepthwiseConvAccumRowGeneric;
1128   }
1129
1130   const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
1131   const int input_batch_stride = input_height_stride * input_shape.Dims(1);
1132   const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);
1133
1134   // Now that we have determined row_accum_func, we can start work.
1135   int batch_start = 0;
1136   int batch_end = batches;
1137   int row_start = 0;
1138   int row_end = output_height;
1139   int output_ptr_offset = 0;
1140
1141   switch (thread_dim)
1142   {
1143     case 0:
1144       // Multithread along with the batch axis
1145       assert(thread_start >= 0);
1146       assert(thread_end <= batches);
1147       batch_start = thread_start;
1148       batch_end = thread_end;
1149       output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0);
1150       break;
1151     case 1:
1152       // Multithread along with the row axis
1153       assert(thread_start >= 0);
1154       assert(thread_end <= output_height);
1155       row_start = thread_start;
1156       row_end = thread_end;
1157       output_ptr_offset = row_start * output_width * output_depth;
1158       break;
1159   }
1160
1161   float *output_ptr = output_data + output_ptr_offset;
1162   int batch_step = (output_height + row_start - row_end) * output_width * output_depth;
1163
1164   for (int b = batch_start; b < batch_end; ++b)
1165   {
1166     for (int out_y = row_start; out_y < row_end; ++out_y)
1167     {
1168       const int in_y_origin = (out_y * stride_height) - pad_height;
1169       const int filter_y_start =
1170         std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor);
1171       const int filter_y_end =
1172         std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) /
1173                                   dilation_height_factor);
1174       for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
1175            out_x_buffer_start += kOutputPixelsInAccBuffer)
1176       {
1177         const int out_x_buffer_end =
1178           std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
1179         // We call a 'pixel' a group of activation that share all but the
1180         // 'depth'/'channel' coordinate. num_output_pixels is the number of
1181         // output pixels that we will accumulate in this loop iteration.
1182         const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
1183         // Initialize our local accumulator with the bias values, so we don't
1184         // have to add them later.
1185         DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer);
1186         // Accumulation loop. Most of the time should be spent in here.
1187         for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
1188         {
1189           const int in_y = in_y_origin + dilation_height_factor * filter_y;
1190           row_accum_func(stride_width, dilation_width_factor, input_depth, input_width,
1191                          input_data + in_y * input_height_stride + b * input_batch_stride,
1192                          pad_width, depth_multiplier, filter_width,
1193                          filter_data + filter_y * filter_height_stride, out_x_buffer_start,
1194                          out_x_buffer_end, output_depth, acc_buffer);
1195         }
1196         // Finished accumulating. Now store to destination.
1197         const int num_output_values = output_depth * num_output_pixels;
1198         int i = 0;
1199 // TODO(benoitjacob) optimized code goes here
1200 #ifdef USE_NEON
1201         // Handle 16 values at a time
1202         for (; i <= num_output_values - 16; i += 16)
1203         {
1204           float32x4_t acc[4];
1205           for (int k = 0; k < 4; k++)
1206           {
1207             acc[k] = vld1q_f32(acc_buffer + i + 4 * k);
1208           }
1209           for (int k = 0; k < 4; k++)
1210           {
1211             acc[k] = vmaxq_f32(vdupq_n_f32(output_activation_min),
1212                                vminq_f32(vdupq_n_f32(output_activation_max), acc[k]));
1213           }
1214           for (int k = 0; k < 4; k++)
1215           {
1216             vst1q_f32(output_ptr + 4 * k, acc[k]);
1217           }
1218           output_ptr += 16;
1219         }
1220         // Handle 4 values at a time
1221         for (; i <= num_output_values - 4; i += 4)
1222         {
1223           float32x4_t acc = vld1q_f32(acc_buffer + i);
1224
1225           acc = vmaxq_f32(vdupq_n_f32(output_activation_min),
1226                           vminq_f32(vdupq_n_f32(output_activation_max), acc));
1227
1228           vst1q_f32(output_ptr, acc);
1229           output_ptr += 4;
1230         }
1231 #endif
1232         // Handle leftover values, one by one. This is very slow.
1233         for (; i < num_output_values; i++)
1234         {
1235           float acc = acc_buffer[i];
1236           acc = std::max(output_activation_min, std::min(output_activation_max, acc));
1237
1238           *output_ptr++ = acc;
1239         }
1240       }
1241     }
1242     output_ptr += batch_step;
1243   }
1244 }
1245
1246 } // namespace optimized
1247 } // namespace cker
1248 } // namespace nnfw
1249
1250 #endif