/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef __NNFW_CKER_FULLY_CONNECTED_H__
#define __NNFW_CKER_FULLY_CONNECTED_H__

#include <ruy/context.h>
#include "cker/Shape.h"
#include "cker/Types.h"
#include "cker/Utils.h"
#include "cker/TensorUtils.h"

namespace nnfw
{
namespace cker
{
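
// Scratch buffers for FullyConnectedHybrid: the float input quantized to int8, one scaling
// factor per batch, and an int32 accumulation area. prepare() sizes the first two from the
// input/weights shapes; accum_scratch is resized on demand inside FullyConnectedHybrid.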
class FCTempArena
{
public:
  FCTempArena(void) : prepared(false), input_quantized(), scaling_factors(), accum_scratch() {}

  void prepare(const Shape &input_shape, const Shape &weights_shape)
  {
    auto input_size = input_shape.FlatSize();
    input_quantized.resize(input_size);
    assert(weights_shape.DimensionsCount() == 2);
    int batch_size = input_size / weights_shape.Dims(1);
    scaling_factors.resize(batch_size);
    prepared = true;
  }

  bool prepared;
  std::vector<int8_t> input_quantized;
  std::vector<float> scaling_factors;
  std::vector<int32_t> accum_scratch;
};
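
// Float fully-connected layer: output = activation(weights * input + bias). weights_shape is
// expected to be [num_units, input_size]; the batch size is derived from the flat input size.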
inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
                           const float *input_data, const Shape &weights_shape,
                           const float *weights_data, const Shape &, const float *bias_data,
                           const Shape &, float *output_data)
{
  int total_input_size = input_shape.FlatSize();
  int input_size = weights_shape.Dims(1);
  const int batch_size = total_input_size / input_size;
  const int num_units = weights_shape.Dims(0);

  // Output = bias if bias tensor exists.
  if (bias_data != nullptr)
  {
    VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data);
  }
  else
  {
    ZeroVector(output_data, batch_size * num_units);
  }

  // Compute output += weight * input
  MatrixBatchVectorMultiplyAccumulate(weights_data, num_units, input_size, input_data, batch_size,
                                      output_data, /*result_stride=*/1);

  if (params.activation != FusedActivationFunctionType::kNone)
  {
    // Apply activation function
    ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
  }
}
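
// Quantized (uint8) fully-connected layer. Accumulates (input + input_offset) *
// (filter + filter_offset) in int32, adds the bias if present, requantizes with
// output_multiplier/output_shift, then adds output_offset and clamps to the activation range.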
inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
                           const uint8_t *input_data, const Shape &filter_shape,
                           const uint8_t *filter_data, const Shape &bias_shape,
                           const int32_t *bias_data, const Shape &output_shape,
                           uint8_t *output_data)
{
  UNUSED_RELEASE(input_shape);
  UNUSED_RELEASE(bias_shape);
  const int32_t input_offset = params.input_offset;
  const int32_t filter_offset = params.weights_offset;
  const int32_t output_offset = params.output_offset;
  const int32_t output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int32_t output_activation_min = params.quantized_activation_min;
  const int32_t output_activation_max = params.quantized_activation_max;
  assert(filter_shape.DimensionsCount() >= 2);
  assert(output_shape.DimensionsCount() >= 1);
  assert(output_activation_min <= output_activation_max);

  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int output_dim_count = output_shape.DimensionsCount();
  const int filter_dim_count = filter_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
  const int output_depth =
    MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1);
  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
  for (int b = 0; b < batches; ++b)
  {
    for (int out_c = 0; out_c < output_depth; ++out_c)
    {
      int32_t acc = 0;
      // Accumulate over the depth dimension with zero-point offsets applied.
      for (int d = 0; d < accum_depth; ++d)
      {
        int32_t input_val = input_data[b * accum_depth + d];
        int32_t filter_val = filter_data[out_c * accum_depth + d];
        acc += (filter_val + filter_offset) * (input_val + input_offset);
      }
      if (bias_data)
      {
        acc += bias_data[out_c];
      }
      // Requantize to the output scale, add the output offset and clamp.
      acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
      acc += output_offset;
      acc = std::max(acc, output_activation_min);
      acc = std::min(acc, output_activation_max);
      output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc);
    }
  }
}
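
// Hybrid fully-connected layer: float activations with symmetrically quantized int8 weights.
// Each input batch is quantized on the fly and its scaling factor (combined with
// params.weights_scale) is applied while accumulating back into the float output.
//
// Typical call sequence (a sketch; variable names are illustrative):
//   FCTempArena arena;
//   if (!arena.prepared)
//     arena.prepare(input_shape, filter_shape);
//   FullyConnectedHybrid(params, input_shape, input_data, filter_shape, filter_data,
//                        bias_shape, bias_data, output_shape, output_data, arena, ruy_context);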
inline void FullyConnectedHybrid(const FullyConnectedParams &params, const Shape &input_shape,
                                 const float *input_data, const Shape &filter_shape,
                                 const int8_t *filter_data, const Shape &, const float *bias_data,
                                 const Shape &output_shape, float *output_data,
                                 FCTempArena &temp_arena, ruy::Context *ruy_context)
{
  int total_input_size = input_shape.FlatSize();
  const int input_size = filter_shape.Dims(1);
  const int batch_size = total_input_size / input_size;
  const int num_units = filter_shape.Dims(0);

  // Output = bias if bias tensor exists.
  if (bias_data != nullptr)
  {
    VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data);
  }
  else
  {
    ZeroVector(output_data, batch_size * num_units);
  }

  // Save matrix multiplication computation for all zero input.
  if (IsZeroVector(input_data, total_input_size))
  {
    ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
    return;
  }

  // Quantize input from float to int8 + quantization params (scaling factor).
  float unused_min, unused_max;
  float *scaling_factors_ptr = temp_arena.scaling_factors.data();
  int8_t *quant_data = temp_arena.input_quantized.data();

  // Quantize each batch independently.
  for (int b = 0; b < batch_size; ++b)
  {
    const int offset = b * input_size;
    SymmetricQuantizeFloats(input_data + offset, input_size, quant_data + offset, &unused_min,
                            &unused_max, &scaling_factors_ptr[b]);
    // Incorporate scaling of the filter.
    scaling_factors_ptr[b] *= params.weights_scale;
  }

  // Compute output += weight * quantized_input
#ifdef USE_RUY_GEMV
  auto output_size = output_shape.FlatSize();
  temp_arena.accum_scratch.resize(output_size);
  int32_t *scratch = temp_arena.accum_scratch.data();
  MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
                                      scaling_factors_ptr, batch_size, scratch, output_data,
                                      /*result_stride=*/1, ruy_context);
#else
  MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
                                      scaling_factors_ptr, batch_size, output_data,
                                      /*result_stride=*/1);
  UNUSED_RELEASE(ruy_context);
  UNUSED_RELEASE(output_shape);
#endif

  // Apply activation function to floats.
  if (params.activation != FusedActivationFunctionType::kNone)
  {
    // Apply activation function
    ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
  }
}
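
// Fully-connected layer with sparse weights in a CSR-like layout: for output unit idx_0,
// w1_segments[idx_0] .. w1_segments[idx_0 + 1] delimits its nonzero weights in weights_data,
// and w1_indices[pw1] gives the input (column) index of each nonzero entry.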
inline void FullyConnectedSparseWeight(const FullyConnectedParams &params, const Shape &input_shape,
                                       const float *input_data, const Shape &weights_shape,
                                       const float *weights_data, const Shape &bias_shape,
                                       const float *bias_data, const Shape &output_shape,
                                       float *output_data, int w0_size, const uint16_t *w1_segments,
                                       const uint16_t *w1_indices)
{
  UNUSED_RELEASE(params);
  UNUSED_RELEASE(input_shape);

  assert(weights_shape.DimensionsCount() == 2);
  assert(output_shape.DimensionsCount() == 2);

  const int output_dims_count = output_shape.DimensionsCount();
  const int weights_dims_count = weights_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
  const int output_depth =
    MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
  const int accum_depth = weights_shape.Dims(weights_dims_count - 1);

  UNUSED_RELEASE(bias_shape);
  if (bias_data)
  {
    VectorBatchVectorAssign(bias_data, output_depth, batches, output_data);
  }
  else
  {
    ZeroVector(output_data, batches * output_depth);
  }
  for (int b = 0; b < batches; ++b)
  {
    for (int idx_0 = 0; idx_0 < w0_size; ++idx_0)
    {
      for (int pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
      {
        int idx_1 = w1_indices[pw1];
        output_data[b * output_depth + idx_0] +=
          weights_data[pw1] * input_data[b * accum_depth + idx_1];
      }
    }
  }
  if (params.activation != FusedActivationFunctionType::kNone)
  {
    // Apply activation function
    ApplyActivationToVector(output_data, batches * output_depth, params.activation, output_data);
  }
}

} // namespace cker
} // namespace nnfw

#endif // __NNFW_CKER_FULLY_CONNECTED_H__