/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef __NNFW_CKER_FULLY_CONNECTED_H__
#define __NNFW_CKER_FULLY_CONNECTED_H__

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

#include <ruy/context.h>
#include "cker/operation/FullyConnectedDense16x1.h"
#include "cker/operation/FullyConnectedSparse16x1.h"
#include "cker/Shape.h"
#include "cker/Types.h"
#include "cker/Utils.h"
#include "cker/TensorUtils.h"
#include "cker/neon/neon_check.h"
38 FCTempArena(void) : prepared(false), input_quantized(), scaling_factors(), accum_scratch()
43 void prepare(const Shape &input_shape, const Shape &weights_shape)
45 auto input_size = input_shape.FlatSize();
46 input_quantized.resize(input_size);
48 assert(weights_shape.DimensionsCount() == 2);
49 int batch_size = input_size / weights_shape.Dims(1);
50 scaling_factors.resize(batch_size);
56 std::vector<int8_t> input_quantized;
57 std::vector<float> scaling_factors;
58 std::vector<int32_t> accum_scratch;
61 inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &input_shape,
62 const float *input_data, const Shape &weights_shape,
63 const float *weights_data, const Shape &, const float *bias_data,
64 const Shape &, float *output_data)
66 int total_input_size = input_shape.FlatSize();
67 int input_size = weights_shape.Dims(1);
68 const int batch_size = total_input_size / input_size;
69 const int num_units = weights_shape.Dims(0);
71 // Output = bias if bias tensor exists.
74 VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data);
78 ZeroVector(output_data, batch_size * num_units);
81 // Compute output += weight * input
82 MatrixBatchVectorMultiplyAccumulate(weights_data, num_units, input_size, input_data, batch_size,
83 output_data, /*result_stride=*/1);
85 if (params.activation != FusedActivationFunctionType::kNone)
87 // Apply activation function
88 ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
92 inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &input_shape,
93 const uint8_t *input_data, const Shape &filter_shape,
94 const uint8_t *filter_data, const Shape &bias_shape,
95 const int32_t *bias_data, const Shape &output_shape,
98 UNUSED_RELEASE(input_shape);
99 UNUSED_RELEASE(bias_shape);
100 const int32_t input_offset = params.input_offset;
101 const int32_t filter_offset = params.weights_offset;
102 const int32_t output_offset = params.output_offset;
103 const int32_t output_multiplier = params.output_multiplier;
104 const int output_shift = params.output_shift;
105 const int32_t output_activation_min = params.quantized_activation_min;
106 const int32_t output_activation_max = params.quantized_activation_max;
107 assert(filter_shape.DimensionsCount() >= 2);
108 assert(output_shape.DimensionsCount() >= 1);
110 assert(output_activation_min <= output_activation_max);
111 // TODO(benoitjacob): This really should be:
112 // const int batches = ArraySize(output_dims, 1);
113 // but the current --variable_batch hack consists in overwriting the 3rd
114 // dimension with the runtime batch size, as we don't keep track for each
115 // array of which dimension is the batch dimension in it.
116 const int output_dim_count = output_shape.DimensionsCount();
117 const int filter_dim_count = filter_shape.DimensionsCount();
118 const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
119 const int output_depth =
120 MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1);
121 const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
122 for (int b = 0; b < batches; ++b)
124 for (int out_c = 0; out_c < output_depth; ++out_c)
127 for (int d = 0; d < accum_depth; ++d)
129 int32_t input_val = input_data[b * accum_depth + d];
130 int32_t filter_val = filter_data[out_c * accum_depth + d];
131 acc += (filter_val + filter_offset) * (input_val + input_offset);
135 acc += bias_data[out_c];
137 acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
138 acc += output_offset;
139 acc = std::max(acc, output_activation_min);
140 acc = std::min(acc, output_activation_max);
141 output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc);
146 inline void FullyConnectedHybrid(const FullyConnectedParams ¶ms, const Shape &input_shape,
147 const float *input_data, const Shape &filter_shape,
148 const int8_t *filter_data, const Shape &, const float *bias_data,
149 const Shape &output_shape, float *output_data,
150 FCTempArena &temp_arena, ruy::Context *ruy_context)
152 int total_input_size = input_shape.FlatSize();
153 const int input_size = filter_shape.Dims(1);
154 const int batch_size = total_input_size / input_size;
155 const int num_units = filter_shape.Dims(0);
157 // Output = bias if bias tensor exists.
160 VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data);
164 ZeroVector(output_data, batch_size * num_units);
167 // Save matrix multiplication computation for all zero input.
168 if (IsZeroVector(input_data, total_input_size))
170 ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
174 // Quantize input from float to uint8 + quantization params (scaling factor).
175 float unused_min, unused_max;
176 float *scaling_factors_ptr = temp_arena.scaling_factors.data();
177 int8_t *quant_data = temp_arena.input_quantized.data();
179 // Quantize each batch independently.
180 for (int b = 0; b < batch_size; ++b)
182 const int offset = b * input_size;
183 SymmetricQuantizeFloats(input_data + offset, input_size, quant_data + offset, &unused_min,
184 &unused_max, &scaling_factors_ptr[b]);
185 // Incorporate scaling of the filter.
186 scaling_factors_ptr[b] *= params.weights_scale;
189 // Compute output += weight * quantized_input
191 auto output_size = output_shape.FlatSize();
192 temp_arena.accum_scratch.resize(output_size);
193 int32_t *scratch = temp_arena.accum_scratch.data();
194 MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
195 scaling_factors_ptr, batch_size, scratch, output_data,
196 /*result_stride=*/1, ruy_context);
198 MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
199 scaling_factors_ptr, batch_size, output_data,
200 /*result_stride=*/1);
201 UNUSED_RELEASE(ruy_context);
202 UNUSED_RELEASE(output_shape);
205 // Apply activation function to floats.
206 if (params.activation != FusedActivationFunctionType::kNone)
208 // Apply activation function
209 ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
214 inline void FullyConnectedSparseWeightRandom(const FullyConnectedParams ¶ms,
215 const Shape &input_shape, const float *input_data,
216 const Shape &weights_shape, const float *weights_data,
217 const Shape &bias_shape, const float *bias_data,
218 const Shape &output_shape, float *output_data,
219 const uint16_t *w1_segments,
220 const uint16_t *w1_indices)
222 UNUSED_RELEASE(params);
223 UNUSED_RELEASE(input_shape);
225 assert(weights_shape.DimensionsCount() == 2);
226 assert(output_shape.DimensionsCount() == 2);
228 const int output_dims_count = output_shape.DimensionsCount();
229 const int weights_dims_count = weights_shape.DimensionsCount();
230 const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
231 const int output_depth =
232 MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
233 const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
235 UNUSED_RELEASE(bias_shape);
238 VectorBatchVectorAssign(bias_data, output_depth, batches, output_data);
242 ZeroVector(output_data, batches * output_depth);
244 for (int b = 0; b < batches; ++b)
246 for (int idx_0 = 0; idx_0 < output_depth; ++idx_0)
248 for (int pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
250 int idx_1 = w1_indices[pw1];
251 output_data[b * output_depth + idx_0] +=
252 weights_data[pw1] * input_data[b * accum_depth + idx_1];
256 if (params.activation != FusedActivationFunctionType::kNone)
258 // Apply activation function
259 ApplyActivationToVector(output_data, batches * output_depth, params.activation, output_data);
#endif // __NNFW_CKER_FULLY_CONNECTED_H__