compute/ruy/include/ruy/NeonTensorUtils.h

   1 /*
   2  * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
   3  * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
   4  *
   5  * Licensed under the Apache License, Version 2.0 (the "License");
   6  * you may not use this file except in compliance with the License.
   7  * You may obtain a copy of the License at
   8  *
   9  *      http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 #ifndef __NNFW_RUY_NEON_TENSOR_UTILS_H__
  19 #define __NNFW_RUY_NEON_TENSOR_UTILS_H__
  20
  21 #include "ruy/neon/neon_check.h"
  22
  23 #ifdef USE_NEON
  24
  25 #define kFloatWeightsPerNeonLane 4 // floats handled per float32x4_t step; must stay a power of two because it is used as a bit mask (value - 1)
  26
  27 namespace nnfw
  28 {
  29 namespace ruy
  30 {
  31
  32 inline bool NeonIsZeroVector(const float *vector, int v_size)
  33 {
  34   // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot
  35   // use the main vectorized loop, and we need to process sequentially.
  36   // postamble_start shows the start index where this should happen.
  37   const int postamble_start = v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
  38
  39   const float32x4_t zero_x4_float = vmovq_n_f32(0.0f);
  40   for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane)
  41   {
  42     const float32x4_t i_x4_float = vld1q_f32(vector + v);
  43     uint32x4_t cmp_result = vceqq_f32(i_x4_float, zero_x4_float);
  44     if (vgetq_lane_u32(cmp_result, 0) == 0)
  45       return false;
  46     if (vgetq_lane_u32(cmp_result, 1) == 0)
  47       return false;
  48     if (vgetq_lane_u32(cmp_result, 2) == 0)
  49       return false;
  50     if (vgetq_lane_u32(cmp_result, 3) == 0)
  51       return false;
  52   }
  53
  54   // Postamble loop
  55   for (int v = postamble_start; v < v_size; ++v)
  56   {
  57     if (vector[v] != 0.0)
  58       return false;
  59   }
  60   return true;
  61 }
  62
  63 } // namespace ruy
  64 } // namespace nnfw
  65
  66 #endif // USE_NEON
  67
  68 #endif // __NNFW_RUY_NEON_TENSOR_UTILS_H__