/*
 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/* Copyright (c) 2018 Mozilla
                 2008-2011 Octasic Inc.
                 2012-2017 Jean-Marc Valin */
/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __NNFW_CKER_FULLY_CONNECTED_SPARSE16x1_H__
#define __NNFW_CKER_FULLY_CONNECTED_SPARSE16x1_H__
#include "cker/Shape.h"
#include "cker/Types.h"
#include "cker/Utils.h"
#include "cker/TensorUtils.h"

#include <cassert>
#include <cstdint>

namespace nnfw
{
namespace cker
{
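// FullyConnectedSparseWeight16x1 computes a fully connected layer
// (output = weights * input + bias, per batch) with the weight matrix stored
// in a 16x1 block-sparse, CSR-like layout:
//   - output_depth is split into output_depth / 16 row blocks (output_depth
//     is expected to be a multiple of 16; there is no tail loop);
//   - for row block idx_0, the nonzero input columns are
//     w1_indices[w1_segments[idx_0] .. w1_segments[idx_0 + 1] - 1];
//   - weights_data packs, for each nonzero entry in that traversal order,
//     the 16 weights of one column slice (one float per output row of the
//     block), so the weight pointer advances by 16 per entry.
// Illustrative example: w1_segments = {0, 2, 3} with w1_indices = {1, 7, 4}
// describes two row blocks; block 0 has nonzero columns 1 and 7, block 1 has
// nonzero column 4, and weights_data holds 3 * 16 floats.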
inline void FullyConnectedSparseWeight16x1(const FullyConnectedParams &params,
                                           const Shape &input_shape, const float *input_data,
                                           const Shape &weights_shape, const float *weights_data,
                                           const Shape &bias_shape, const float *bias_data,
                                           const Shape &output_shape, float *output_data,
                                           const uint16_t *w1_segments, const uint16_t *w1_indices)
{
  UNUSED_RELEASE(input_shape);

  assert(weights_shape.DimensionsCount() == 2);
  assert(output_shape.DimensionsCount() == 2);
  const int output_dims_count = output_shape.DimensionsCount();
  const int weights_dims_count = weights_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
  const int output_depth =
    MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
  const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
  UNUSED_RELEASE(bias_shape);
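  // Prime the output with the bias (or zeros when no bias is given) so the
  // sparse pass below can accumulate into it.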
  if (bias_data)
  {
    VectorBatchVectorAssign(bias_data, output_depth, batches, output_data);
  }
  else
  {
    ZeroVector(output_data, batches * output_depth);
  }
  for (int b = 0; b < batches; ++b)
  {
    int depth_size = output_depth / 16;
    for (int idx_0 = 0; idx_0 < depth_size; ++idx_0)
#ifdef USE_NEON
    {
      float *__restrict y = &output_data[b * output_depth + idx_0 * 16];
      /* keep y[0..15] in registers for duration of inner loop */
      float32x4_t y0_3 = vld1q_f32(&y[0]);
      float32x4_t y4_7 = vld1q_f32(&y[4]);
      float32x4_t y8_11 = vld1q_f32(&y[8]);
      float32x4_t y12_15 = vld1q_f32(&y[12]);
      for (auto pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
      {
        auto idx_1 = w1_indices[pw1];
        /* broadcast the single input value x[idx_1] to all four lanes */
        float32x4_t xj = vld1q_dup_f32(&input_data[b * accum_depth + idx_1]);
        float32x4_t wvec;
        wvec = vld1q_f32(&weights_data[0]);
        y0_3 = vmlaq_f32(y0_3, wvec, xj);
        wvec = vld1q_f32(&weights_data[4]);
        y4_7 = vmlaq_f32(y4_7, wvec, xj);
        wvec = vld1q_f32(&weights_data[8]);
        y8_11 = vmlaq_f32(y8_11, wvec, xj);
        wvec = vld1q_f32(&weights_data[12]);
        y12_15 = vmlaq_f32(y12_15, wvec, xj);
        /* advance to the 16 weights of the next nonzero column */
        weights_data += 16;
      }
      /* save y[0..15] back to memory */
      vst1q_f32(&y[0], y0_3);
      vst1q_f32(&y[4], y4_7);
      vst1q_f32(&y[8], y8_11);
      vst1q_f32(&y[12], y12_15);
    }
#else
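    /* portable scalar fallback: the same accumulation as the NEON path
       above, one output row at a time */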
    {
      for (auto pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
      {
        auto idx_1 = w1_indices[pw1];
        float xj = input_data[b * accum_depth + idx_1];
        float *__restrict y = &output_data[b * output_depth + idx_0 * 16];
        y[0] += weights_data[0] * xj;
        y[1] += weights_data[1] * xj;
        y[2] += weights_data[2] * xj;
        y[3] += weights_data[3] * xj;
        y[4] += weights_data[4] * xj;
        y[5] += weights_data[5] * xj;
        y[6] += weights_data[6] * xj;
        y[7] += weights_data[7] * xj;
        y[8] += weights_data[8] * xj;
        y[9] += weights_data[9] * xj;
        y[10] += weights_data[10] * xj;
        y[11] += weights_data[11] * xj;
        y[12] += weights_data[12] * xj;
        y[13] += weights_data[13] * xj;
        y[14] += weights_data[14] * xj;
        y[15] += weights_data[15] * xj;
        /* advance to the 16 weights of the next nonzero column */
        weights_data += 16;
      }
    }
#endif // USE_NEON
  }
  if (params.activation != FusedActivationFunctionType::kNone)
  {
    // Apply activation function
    ApplyActivationToVector(output_data, batches * output_depth, params.activation, output_data);
  }
}
} // namespace cker
} // namespace nnfw
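/*
 * Usage sketch (illustrative only; `values`, `segments`, and `indices` are
 * hypothetical caller-side containers holding the packed 16x1 block-sparse
 * data described above, not part of this header):
 *
 *   FullyConnectedSparseWeight16x1(params, input_shape, input.data(),
 *                                  weights_shape, values.data(),
 *                                  bias_shape, bias.data(),
 *                                  output_shape, output.data(),
 *                                  segments.data(), indices.data());
 */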
#endif // __NNFW_CKER_FULLY_CONNECTED_SPARSE16x1_H__