compute/cker/include/cker/operation/Common.h

   1 /*
   2  * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
   3  * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
   4  *
   5  * Licensed under the Apache License, Version 2.0 (the "License");
   6  * you may not use this file except in compliance with the License.
   7  * You may obtain a copy of the License at
   8  *
   9  *      http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 #ifndef __NNFW_CKER_COMMON_H__
  19 #define __NNFW_CKER_COMMON_H__
  20
  21 #include "cker/neon/neon_check.h"
  22 #include "cker/Utils.h"
  23
  24 namespace nnfw
  25 {
  26 namespace cker
  27 {
  28
  29 inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size, const float *bias_data,
  30                          int array_size, float *array_data)
  31 {
  32   // Note: see b/132215220: in May 2019 we thought it would be OK to replace
  33   // this with the Eigen one-liner:
  34   //   return (array.colwise() + bias).cwiseMin(clamp_max).cwiseMin(clamp_max).
  35   // This turned out to severely regress performance: +4ms (i.e. 8%) on
  36   // MobileNet v2 / 1.0 / 224. So we keep custom NEON code for now.
  37   assert((array_size % bias_size) == 0);
  38 #ifdef USE_NEON
  39   float *array_ptr = array_data;
  40   float *array_end_ptr = array_ptr + array_size;
  41   const auto clamp_min_vec = vdupq_n_f32(clamp_min);
  42   const auto clamp_max_vec = vdupq_n_f32(clamp_max);
  43   for (; array_ptr != array_end_ptr; array_ptr += bias_size)
  44   {
  45     int i = 0;
  46     for (; i <= bias_size - 16; i += 16)
  47     {
  48       auto b0 = vld1q_f32(bias_data + i);
  49       auto b1 = vld1q_f32(bias_data + i + 4);
  50       auto b2 = vld1q_f32(bias_data + i + 8);
  51       auto b3 = vld1q_f32(bias_data + i + 12);
  52       auto a0 = vld1q_f32(array_ptr + i);
  53       auto a1 = vld1q_f32(array_ptr + i + 4);
  54       auto a2 = vld1q_f32(array_ptr + i + 8);
  55       auto a3 = vld1q_f32(array_ptr + i + 12);
  56       auto x0 = vaddq_f32(a0, b0);
  57       auto x1 = vaddq_f32(a1, b1);
  58       auto x2 = vaddq_f32(a2, b2);
  59       auto x3 = vaddq_f32(a3, b3);
  60       x0 = vmaxq_f32(clamp_min_vec, x0);
  61       x1 = vmaxq_f32(clamp_min_vec, x1);
  62       x2 = vmaxq_f32(clamp_min_vec, x2);
  63       x3 = vmaxq_f32(clamp_min_vec, x3);
  64       x0 = vminq_f32(clamp_max_vec, x0);
  65       x1 = vminq_f32(clamp_max_vec, x1);
  66       x2 = vminq_f32(clamp_max_vec, x2);
  67       x3 = vminq_f32(clamp_max_vec, x3);
  68       vst1q_f32(array_ptr + i, x0);
  69       vst1q_f32(array_ptr + i + 4, x1);
  70       vst1q_f32(array_ptr + i + 8, x2);
  71       vst1q_f32(array_ptr + i + 12, x3);
  72     }
  73     for (; i <= bias_size - 4; i += 4)
  74     {
  75       auto b = vld1q_f32(bias_data + i);
  76       auto a = vld1q_f32(array_ptr + i);
  77       auto x = vaddq_f32(a, b);
  78       x = vmaxq_f32(clamp_min_vec, x);
  79       x = vminq_f32(clamp_max_vec, x);
  80       vst1q_f32(array_ptr + i, x);
  81     }
  82     for (; i < bias_size; i++)
  83     {
  84       array_ptr[i] =
  85           ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i], clamp_min, clamp_max);
  86     }
  87   }
  88 #else // not NEON
  89   for (int array_offset = 0; array_offset < array_size; array_offset += bias_size)
  90   {
  91     for (int i = 0; i < bias_size; i++)
  92     {
  93       array_data[array_offset + i] = ActivationFunctionWithMinMax(
  94           array_data[array_offset + i] + bias_data[i], clamp_min, clamp_max);
  95     }
  96   }
  97 #endif
  98 }
  99
 100 } // namespace cker
 101 } // namespace nnfw
 102
 103 #endif // __NNFW_CKER_COMMON_H__