/*
 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/* Copyright (c) 2018 Mozilla
                 2008-2011 Octasic Inc.
                 2012-2017 Jean-Marc Valin */
/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __NNFW_CKER_FULLY_CONNECTED_SPARSE16x1_H__
#define __NNFW_CKER_FULLY_CONNECTED_SPARSE16x1_H__
#include "cker/Shape.h"
#include "cker/Types.h"
#include "cker/Utils.h"
#include "cker/TensorUtils.h"

#include <cassert>
#include <cstdint>

namespace nnfw
{
namespace cker
{
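// FullyConnectedSparseWeight16x1 computes a fully connected layer
// (output = weights * input + bias, per batch) with the weight matrix stored
// in a 16x1 block-sparse, CSR-like layout:
//   - output_depth is split into output_depth / 16 row blocks (output_depth
//     is expected to be a multiple of 16; there is no tail loop);
//   - for row block idx_0, the nonzero input columns are
//     w1_indices[w1_segments[idx_0] .. w1_segments[idx_0 + 1] - 1];
//   - weights_data packs, for each nonzero entry in that traversal order,
//     the 16 weights of one column slice (one float per output row of the
//     block), so the weight pointer advances by 16 per entry.
// Illustrative example: w1_segments = {0, 2, 3} with w1_indices = {1, 7, 4}
// describes two row blocks; block 0 has nonzero columns 1 and 7, block 1 has
// nonzero column 4, and weights_data holds 3 * 16 floats.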
inline void FullyConnectedSparseWeight16x1(const FullyConnectedParams &params,
                                           const Shape &input_shape, const float *input_data,
                                           const Shape &weights_shape, const float *weights_data,
                                           const Shape &bias_shape, const float *bias_data,
                                           const Shape &output_shape, float *output_data,
                                           const uint16_t *w1_segments, const uint16_t *w1_indices)
{
  UNUSED_RELEASE(input_shape);

  assert(weights_shape.DimensionsCount() == 2);
  assert(output_shape.DimensionsCount() == 2);
  const int output_dims_count = output_shape.DimensionsCount();
  const int weights_dims_count = weights_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
  const int output_depth =
    MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
  const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
  UNUSED_RELEASE(bias_shape);
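  // Prime the output with the bias (or zeros when no bias is given) so the
  // sparse pass below can accumulate into it.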
  if (bias_data)
  {
    VectorBatchVectorAssign(bias_data, output_depth, batches, output_data);
  }
  else
  {
    ZeroVector(output_data, batches * output_depth);
  }
  for (int b = 0; b < batches; ++b)
  {
    int depth_size = output_depth / 16;
    for (int idx_0 = 0; idx_0 < depth_size; ++idx_0)
#ifdef USE_NEON
    {
      float *__restrict y = &output_data[b * output_depth + idx_0 * 16];
      /* keep y[0..15] in registers for duration of inner loop */
      float32x4_t y0_3 = vld1q_f32(&y[0]);
      float32x4_t y4_7 = vld1q_f32(&y[4]);
      float32x4_t y8_11 = vld1q_f32(&y[8]);
      float32x4_t y12_15 = vld1q_f32(&y[12]);
      for (auto pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
      {
        auto idx_1 = w1_indices[pw1];
        /* broadcast the single input value x[idx_1] to all four lanes */
        float32x4_t xj = vld1q_dup_f32(&input_data[b * accum_depth + idx_1]);
        float32x4_t wvec;
        wvec = vld1q_f32(&weights_data[0]);
        y0_3 = vmlaq_f32(y0_3, wvec, xj);
        wvec = vld1q_f32(&weights_data[4]);
        y4_7 = vmlaq_f32(y4_7, wvec, xj);
        wvec = vld1q_f32(&weights_data[8]);
        y8_11 = vmlaq_f32(y8_11, wvec, xj);
        wvec = vld1q_f32(&weights_data[12]);
        y12_15 = vmlaq_f32(y12_15, wvec, xj);
        /* advance to the 16 weights of the next nonzero column */
        weights_data += 16;
      }
      /* save y[0..15] back to memory */
      vst1q_f32(&y[0], y0_3);
      vst1q_f32(&y[4], y4_7);
      vst1q_f32(&y[8], y8_11);
      vst1q_f32(&y[12], y12_15);
    }
#else
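    /* portable scalar fallback: the same accumulation as the NEON path
       above, one output row at a time */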
    {
      for (auto pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
      {
        auto idx_1 = w1_indices[pw1];
        float xj = input_data[b * accum_depth + idx_1];
        float *__restrict y = &output_data[b * output_depth + idx_0 * 16];
        y[0] += weights_data[0] * xj;
        y[1] += weights_data[1] * xj;
        y[2] += weights_data[2] * xj;
        y[3] += weights_data[3] * xj;
        y[4] += weights_data[4] * xj;
        y[5] += weights_data[5] * xj;
        y[6] += weights_data[6] * xj;
        y[7] += weights_data[7] * xj;
        y[8] += weights_data[8] * xj;
        y[9] += weights_data[9] * xj;
        y[10] += weights_data[10] * xj;
        y[11] += weights_data[11] * xj;
        y[12] += weights_data[12] * xj;
        y[13] += weights_data[13] * xj;
        y[14] += weights_data[14] * xj;
        y[15] += weights_data[15] * xj;
        /* advance to the 16 weights of the next nonzero column */
        weights_data += 16;
      }
    }
#endif // USE_NEON
  }
  if (params.activation != FusedActivationFunctionType::kNone)
  {
    // Apply activation function
    ApplyActivationToVector(output_data, batches * output_depth, params.activation, output_data);
  }
}
} // namespace cker
} // namespace nnfw
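/*
 * Usage sketch (illustrative only; `values`, `segments`, and `indices` are
 * hypothetical caller-side containers holding the packed 16x1 block-sparse
 * data described above, not part of this header):
 *
 *   FullyConnectedSparseWeight16x1(params, input_shape, input.data(),
 *                                  weights_shape, values.data(),
 *                                  bias_shape, bias.data(),
 *                                  output_shape, output.data(),
 *                                  segments.data(), indices.data());
 */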
#endif // __NNFW_CKER_FULLY_CONNECTED_SPARSE16x1_H__