/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef __NNFW_CKER_FULLY_CONNECTED_H__
#define __NNFW_CKER_FULLY_CONNECTED_H__

#include <ruy/context.h>
#include "cker/operation/FullyConnectedDense16x1.h"
#include "cker/operation/FullyConnectedSparse16x1.h"
#include "cker/operation/optimized/Gemm.h"
#include "cker/Shape.h"
#include "cker/Types.h"
#include "cker/Utils.h"
#include "cker/TensorUtils.h"
#include "cker/neon/neon_check.h"

namespace nnfw
{
namespace cker
{

class FCTempArena
{
public:
  FCTempArena(void) : prepared(false), input_quantized(), scaling_factors(), accum_scratch()
  {
    // DO NOTHING
  }

  void prepare(const Shape &input_shape, const Shape &weights_shape)
  {
    auto input_size = input_shape.FlatSize();
    input_quantized.resize(input_size);

    assert(weights_shape.DimensionsCount() == 2);
    int batch_size = input_size / weights_shape.Dims(1);
    scaling_factors.resize(batch_size);
    prepared = true;
  }

public:
  bool prepared;
  std::vector<int8_t> input_quantized;
  std::vector<float> scaling_factors;
  std::vector<int32_t> accum_scratch;
};
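
// Illustrative use (a sketch with hypothetical shapes, not taken from this
// header): prepare the arena once per layer so FullyConnectedHybrid can reuse
// its quantization scratch buffers across invocations instead of reallocating
// them on every call.
//
//   FCTempArena arena;
//   Shape input_shape{2, 20};    // batch_size x input_size
//   Shape weights_shape{10, 20}; // num_units x input_size
//   arena.prepare(input_shape, weights_shape);
//   assert(arena.prepared);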

#if defined(CKER_X86_PLATFORM)

// From tensorflow/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
                           const float *input_data, const Shape &weights_shape,
                           const float *weights_data, const Shape &,
                           const float *optional_bias_data, const Shape &output_shape,
                           float *output_data)
{
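  // Operand mapping for the GEMM call below: lhs is the row-major
  // [num_units x input_size] weight matrix, rhs packs the batch as a
  // column-major [input_size x batches] matrix, and dst receives the
  // column-major [num_units x batches] result; bias addition and the
  // activation clamp are fused into the GEMM through gemm_params.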
  const int dims_count = weights_shape.DimensionsCount();
  const int input_rows = weights_shape.Dims(dims_count - 1);
  MatrixParams<float> rhs_params;
  rhs_params.order = Order::kColMajor;
  rhs_params.rows = input_rows;
  rhs_params.cols = input_shape.FlatSize() / input_rows;
  rhs_params.cache_policy = optimized::DefaultCachePolicy(params.rhs_cacheable);

  MatrixParams<float> lhs_params;
  lhs_params.order = Order::kRowMajor;
  lhs_params.cols = weights_shape.Dims(dims_count - 1);
  lhs_params.rows = FlatSizeSkipDim(weights_shape, dims_count - 1);
  lhs_params.cache_policy = optimized::DefaultCachePolicy(params.lhs_cacheable);
  MatrixParams<float> dst_params;
  dst_params.order = Order::kColMajor;
  dst_params.rows = output_shape.Dims(output_shape.DimensionsCount() - 1);
  dst_params.cols = FlatSizeSkipDim(output_shape, output_shape.DimensionsCount() - 1);
  GemmParams<float, float> gemm_params;
  gemm_params.bias = optional_bias_data;
  gemm_params.clamp_min = params.float_activation_min;
  gemm_params.clamp_max = params.float_activation_max;
  optimized::Gemm(lhs_params, weights_data, rhs_params, input_data, dst_params, output_data,
                  gemm_params);
}

#else // CKER_X86_PLATFORM

inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
                           const float *input_data, const Shape &weights_shape,
                           const float *weights_data, const Shape &, const float *bias_data,
                           const Shape &, float *output_data)
{
  int total_input_size = input_shape.FlatSize();
  int input_size = weights_shape.Dims(1);
  const int batch_size = total_input_size / input_size;
  const int num_units = weights_shape.Dims(0);

  // Output = bias if bias tensor exists.
  if (bias_data)
  {
    VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data);
  }
  else
  {
    ZeroVector(output_data, batch_size * num_units);
  }

  // Compute output += weight * input
  MatrixBatchVectorMultiplyAccumulate(weights_data, num_units, input_size, input_data, batch_size,
                                      output_data, /*result_stride=*/1);

  if (params.activation != FusedActivationFunctionType::kNone)
  {
    // Apply activation function
    ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
  }
}

#endif // CKER_X86_PLATFORM
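
// Quantized (uint8) fully connected. Each output element is accumulated in
// int32 as sum_d (filter[d] + filter_offset) * (input[d] + input_offset),
// plus the optional bias, then requantized with the fixed-point
// output_multiplier / output_shift pair (which, per the usual TFLite
// convention, approximates input_scale * filter_scale / output_scale),
// offset and clamped into the uint8 activation range.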
inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
                           const uint8_t *input_data, const Shape &filter_shape,
                           const uint8_t *filter_data, const Shape &bias_shape,
                           const int32_t *bias_data, const Shape &output_shape,
                           uint8_t *output_data)
{
  UNUSED_RELEASE(input_shape);
  UNUSED_RELEASE(bias_shape);
  const int32_t input_offset = params.input_offset;
  const int32_t filter_offset = params.weights_offset;
  const int32_t output_offset = params.output_offset;
  const int32_t output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int32_t output_activation_min = params.quantized_activation_min;
  const int32_t output_activation_max = params.quantized_activation_max;
  assert(filter_shape.DimensionsCount() >= 2);
  assert(output_shape.DimensionsCount() >= 1);

  assert(output_activation_min <= output_activation_max);
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int output_dim_count = output_shape.DimensionsCount();
  const int filter_dim_count = filter_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
  const int output_depth =
    MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1);
  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
  for (int b = 0; b < batches; ++b)
  {
    for (int out_c = 0; out_c < output_depth; ++out_c)
    {
      int32_t acc = 0;
      for (int d = 0; d < accum_depth; ++d)
      {
        int32_t input_val = input_data[b * accum_depth + d];
        int32_t filter_val = filter_data[out_c * accum_depth + d];
        acc += (filter_val + filter_offset) * (input_val + input_offset);
      }
      if (bias_data)
      {
        acc += bias_data[out_c];
      }
      acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
      acc += output_offset;
      acc = std::max(acc, output_activation_min);
      acc = std::min(acc, output_activation_max);
      output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc);
    }
  }
}
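
// Hybrid fully connected: float activations with int8 weights. The input is
// symmetrically quantized per batch at runtime, each batch's scale is folded
// into scaling_factors together with params.weights_scale, and the integer
// accumulation is scaled back to float.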
inline void FullyConnectedHybrid(const FullyConnectedParams &params, const Shape &input_shape,
                                 const float *input_data, const Shape &filter_shape,
                                 const int8_t *filter_data, const Shape &, const float *bias_data,
                                 const Shape &output_shape, float *output_data,
                                 FCTempArena &temp_arena, ruy::Context *ruy_context)
{
  int total_input_size = input_shape.FlatSize();
  const int input_size = filter_shape.Dims(1);
  const int batch_size = total_input_size / input_size;
  const int num_units = filter_shape.Dims(0);

  // Output = bias if bias tensor exists.
  if (bias_data)
  {
    VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data);
  }
  else
  {
    ZeroVector(output_data, batch_size * num_units);
  }

  // Save matrix multiplication computation for all zero input.
  if (IsZeroVector(input_data, total_input_size))
  {
    ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
    return;
  }

  // Quantize input from float to uint8 + quantization params (scaling factor).
  float unused_min, unused_max;
  float *scaling_factors_ptr = temp_arena.scaling_factors.data();
  int8_t *quant_data = temp_arena.input_quantized.data();

  // Quantize each batch independently.
  for (int b = 0; b < batch_size; ++b)
  {
    const int offset = b * input_size;
    SymmetricQuantizeFloats(input_data + offset, input_size, quant_data + offset, &unused_min,
                            &unused_max, &scaling_factors_ptr[b]);
    // Incorporate scaling of the filter.
    scaling_factors_ptr[b] *= params.weights_scale;
  }
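
  // At this point quant_data holds the int8-quantized input and
  // scaling_factors_ptr[b] holds input_scale(b) * weights_scale, so the int32
  // accumulators produced below can be rescaled straight back to float.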

  // Compute output += weight * quantized_input
#ifdef USE_RUY_GEMV
  auto output_size = output_shape.FlatSize();
  temp_arena.accum_scratch.resize(output_size);
  int32_t *scratch = temp_arena.accum_scratch.data();
  MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
                                      scaling_factors_ptr, batch_size, scratch, output_data,
                                      /*result_stride=*/1, ruy_context);
#else
  MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
                                      scaling_factors_ptr, batch_size, output_data,
                                      /*result_stride=*/1);
  UNUSED_RELEASE(ruy_context);
  UNUSED_RELEASE(output_shape);
#endif // USE_RUY_GEMV

  // Apply activation function to floats.
  if (params.activation != FusedActivationFunctionType::kNone)
  {
    // Apply activation function
    ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
  }
}
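
// Sparse fully connected over a CSR-like weight layout: for output row idx_0,
// the half-open range [w1_segments[idx_0], w1_segments[idx_0 + 1]) indexes the
// stored non-zero weights, and w1_indices[pw1] gives the input column of the
// pw1-th stored weight.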
inline void FullyConnectedSparseWeightRandom(const FullyConnectedParams &params,
                                             const Shape &input_shape, const float *input_data,
                                             const Shape &weights_shape, const float *weights_data,
                                             const Shape &bias_shape, const float *bias_data,
                                             const Shape &output_shape, float *output_data,
                                             const uint16_t *w1_segments,
                                             const uint16_t *w1_indices)
{
  UNUSED_RELEASE(params);
  UNUSED_RELEASE(input_shape);

  assert(weights_shape.DimensionsCount() == 2);
  assert(output_shape.DimensionsCount() == 2);

  const int output_dims_count = output_shape.DimensionsCount();
  const int weights_dims_count = weights_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
  const int output_depth =
    MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
  const int accum_depth = weights_shape.Dims(weights_dims_count - 1);

  UNUSED_RELEASE(bias_shape);
  if (bias_data)
  {
    VectorBatchVectorAssign(bias_data, output_depth, batches, output_data);
  }
  else
  {
    ZeroVector(output_data, batches * output_depth);
  }

  for (int b = 0; b < batches; ++b)
  {
    for (int idx_0 = 0; idx_0 < output_depth; ++idx_0)
    {
      for (int pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
      {
        int idx_1 = w1_indices[pw1];
        output_data[b * output_depth + idx_0] +=
          weights_data[pw1] * input_data[b * accum_depth + idx_1];
      }
    }
  }
  if (params.activation != FusedActivationFunctionType::kNone)
  {
    // Apply activation function
    ApplyActivationToVector(output_data, batches * output_depth, params.activation, output_data);
  }
}
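
// Illustrative layout for FullyConnectedSparseWeightRandom above (a
// hypothetical 2 x 4 weight matrix whose non-zeros are w00, w02 and w13):
//   weights_data = {w00, w02, w13}
//   w1_segments  = {0, 2, 3} // row 0 owns entries [0, 2), row 1 owns [2, 3)
//   w1_indices   = {0, 2, 3} // input column of each stored weight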

} // namespace cker
} // namespace nnfw

#endif // __NNFW_CKER_FULLY_CONNECTED_H__