1 // Copyright (C) 2018-2019 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
7 #include "quantization.h"
9 void QuantizeAffine16(float *ptr_float_weights,
10 float *ptr_float_biases,
11 int16_t *ptr_int_weights,
12 int32_t *ptr_int_biases,
13 float input_scale_factor,
14 float *ptr_weight_scale_factor,
15 float *ptr_output_scale_factor,
18 uint32_t num_rows_padded,
19 uint32_t num_columns_padded) {
20 uint32_t num_saturate = 0;
22 if (*ptr_weight_scale_factor == 1.0) {
23 // scale factor for weights is not calculated yet
24 float mean_weight = 0.0;
25 float mean_weight_squared = 0.0;
26 float max_weight = -1e20f;
28 float mean_plus_2stdev;
30 for (uint32_t i = 0; i < num_rows; i++) {
31 for (uint32_t j = 0; j < num_columns; j++) {
32 float weight = ptr_float_weights[i * num_columns + j];
33 mean_weight += weight;
34 mean_weight_squared += weight * weight;
35 if (fabs(weight) > max_weight) {
36 max_weight = fabs(weight);
41 mean_weight /= static_cast<float>(num_rows * num_columns);
42 mean_weight_squared /= static_cast<float>(num_rows * num_columns);
43 var_weight = mean_weight_squared - mean_weight * mean_weight;
44 mean_plus_2stdev = mean_weight + 2.0f * static_cast<float>(sqrtf(var_weight));
46 *ptr_weight_scale_factor = static_cast<float>(MAX_VAL_2B_WEIGHT) / max_weight;
47 *ptr_output_scale_factor = input_scale_factor * *ptr_weight_scale_factor;
50 for (uint32_t row = 0; row < num_rows; row++) {
51 for (uint32_t col = 0; col < num_columns; col++) {
52 float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f;
53 float value = ptr_float_weights[row * num_columns + col] * *ptr_weight_scale_factor + rounding_value;
54 int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
55 if (value > 32767.0) {
56 *ptr_weight_16 = 32767;
58 } else if (value < -32768.0) {
59 *ptr_weight_16 = -32768;
62 *ptr_weight_16 = (int16_t) value;
65 for (uint32_t col = num_columns; col < num_columns_padded; col++) {
66 int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
70 for (uint32_t row = num_rows; row < num_rows_padded; row++) {
71 for (uint32_t col = 0; col < num_columns_padded; col++) {
72 int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
77 // case for element wise layer
78 if (ptr_float_biases != nullptr && ptr_int_biases != nullptr) {
79 for (uint32_t j = 0; j < num_rows; j++) {
80 float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
81 float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value;
82 if (value > 2147483647.0) {
83 ptr_int_biases[j] = 2147483647L;
85 } else if (value < -2147483648.0) {
86 ptr_int_biases[j] = -2147483648LL;
89 ptr_int_biases[j] = (int32_t) value;
92 for (uint32_t j = num_rows; j < num_rows_padded; j++) {
93 ptr_int_biases[j] = 0;
97 if (num_saturate > 0) {
98 QUANTWARNING("Warning: %d / %d saturations in QuantizeAffine16()\n",
100 num_rows * num_columns + num_rows);
104 void FixedQuantizeAffine16(float *ptr_float_weights,
105 float *ptr_float_biases,
106 int16_t *ptr_int_weights,
107 int32_t *ptr_int_biases,
108 float input_scale_factor,
109 float weight_scale_factor,
110 float *ptr_output_scale_factor,
112 uint32_t num_columns,
113 uint32_t num_rows_padded,
114 uint32_t num_columns_padded) {
115 uint32_t num_saturate = 0;
117 for (uint32_t row = 0; row < num_rows; row++) {
118 for (uint32_t col = 0; col < num_columns; col++) {
119 float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f;
120 float value = ptr_float_weights[row * num_columns + col] * weight_scale_factor + rounding_value;
121 int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
122 if (value > 32767.0) {
123 *ptr_weight_16 = 32767;
125 } else if (value < -32768.0) {
126 *ptr_weight_16 = -32768;
129 *ptr_weight_16 = (int16_t) value;
133 for (uint32_t row = num_rows; row < num_rows_padded; row++) {
134 for (uint32_t col = 0; col < num_columns_padded; col++) {
135 int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
140 *ptr_output_scale_factor = input_scale_factor * weight_scale_factor;
142 for (uint32_t j = 0; j < num_rows; j++) {
143 float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
144 float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value;
145 if (value > 2147483647.0) {
146 ptr_int_biases[j] = 2147483647L;
148 } else if (value < -2147483648.0) {
149 ptr_int_biases[j] = -2147483648LL;
152 ptr_int_biases[j] = (int32_t) value;
155 for (uint32_t j = num_rows; j < num_rows_padded; j++) {
156 ptr_int_biases[j] = 0;
159 if (num_saturate > 0) {
160 QUANTWARNING("Warning: %d / %d saturations in FixedQuantizeAffine16()\n",
162 num_rows * num_columns + num_rows);
// Returns target_max / max|x| over num_elements floats starting at
// ptr_float_memory, i.e. the scale factor that maps the largest-magnitude
// element onto target_max. Returns -1.0f when all elements are zero (no
// meaningful scale factor exists).
float ScaleFactorForQuantization(void *ptr_float_memory, float target_max, size_t num_elements) {
    float *ptr_float_feat = reinterpret_cast<float *>(ptr_float_memory);
    float max = 0.0;
    float scale_factor;

    for (size_t i = 0; i < num_elements; i++) {
        if (fabs(ptr_float_feat[i]) > max) {
            max = fabs(ptr_float_feat[i]);
        }
    }

    if (max == 0) {
        scale_factor = -1.0f;  // all-zero input: flag that no scale factor applies
    } else {
        scale_factor = target_max / max;
    }

    return (scale_factor);
}
// Returns target_max / max|x| over all elements of all input vectors.
// Returns -1.0f when every element is zero.
//
// Fix: the inner loop previously tested and incremented the outer index `i`
// ("for (uint32_t j = 0; i < num_elements; i++)"), scanning wrong elements
// and corrupting the outer iteration; it now iterates `j` correctly.
float ScaleFactorForQuantization(std::vector<std::vector<float>> &input_vectors, float target_max) {
    float max = 0.0;
    float scale_factor;
    uint32_t num_vectors = (uint32_t) input_vectors.size();

    for (uint32_t i = 0; i < num_vectors; i++) {
        float *ptr_float_feat = input_vectors[i].data();
        uint32_t num_elements = (uint32_t) input_vectors[i].size();
        for (uint32_t j = 0; j < num_elements; j++) {
            if (fabs(ptr_float_feat[j]) > max) {
                max = fabs(ptr_float_feat[j]);
            }
        }
    }

    if (max == 0) {
        scale_factor = -1.0f;  // all-zero input: flag that no scale factor applies
    } else {
        scale_factor = target_max / max;
    }

    return (scale_factor);
}
// Returns target_max / max|x| over the group of vectors
// [index, index + num_group_size), clipped against the container size.
// Returns -1.0f when every scanned element is zero.
//
// NOTE(review): when the group overruns the container, end_index becomes
// input_vectors.size() - 1, which skips the last vector -- looks like an
// off-by-one inherited from the original; kept as-is, confirm intent.
float ScaleFactorForQuantization(std::vector<std::vector<float>> &input_vectors,
                                 float target_max,
                                 uint32_t index,
                                 uint32_t num_group_size) {
    float max = 0.0;
    float scale_factor;
    uint32_t start_index = (uint32_t) index;
    uint32_t end_index =
        (uint32_t) ((index + num_group_size > input_vectors.size()) ? input_vectors.size() - 1 : start_index
            + num_group_size);

    for (uint32_t i = start_index; i < end_index; i++) {
        float *ptr_float_feat = input_vectors[i].data();
        uint32_t num_elements = (uint32_t) input_vectors[i].size();
        for (uint32_t j = 0; j < num_elements; j++) {
            if (fabs(ptr_float_feat[j]) > max) {
                max = fabs(ptr_float_feat[j]);
            }
        }
    }

    if (max == 0) {
        scale_factor = -1.0f;  // all-zero input: flag that no scale factor applies
    } else {
        scale_factor = target_max / max;
    }

    return (scale_factor);
}
240 void QuantizeVector16(float *ptr_float_memory, int16_t *ptr_int_memory, uint32_t num_elements, float scale_factor) {
241 float *ptr_float_feat = reinterpret_cast<float *>(ptr_float_memory);
242 uint32_t num_saturate = 0;
244 int16_t *ptr_int_feat = reinterpret_cast<int16_t *>(ptr_int_memory);
245 for (uint32_t i = 0; i < num_elements; i++) {
246 float rounding_value = (ptr_float_feat[i] > 0) ? 0.5f : -0.5f;
247 float value = ptr_float_feat[i] * scale_factor + rounding_value;
248 if (value > 32767.0) {
249 ptr_int_feat[i] = 32767;
251 } else if (value < -32768.0) {
252 ptr_int_feat[i] = -32768;
255 ptr_int_feat[i] = (int16_t) value;
259 if (num_saturate > 0) {
260 QUANTWARNING("Warning: %d / %d saturations during QuantizeVector16()\n", num_saturate, num_elements);
264 void QuantizeVector16(std::vector<std::vector<float>> &input_vectors,
265 int16_t *ptr_int_memory,
267 uint32_t num_group_size,
268 float scale_factor) {
269 int16_t *ptr_int_feat = reinterpret_cast<int16_t *> (ptr_int_memory);
270 uint32_t num_saturate = 0;
271 uint32_t num_elements = (uint32_t) input_vectors[0].size(); // assume all vector are same size
272 uint32_t start_index = (uint32_t) index;
274 (uint32_t) ((index + num_group_size > input_vectors.size()) ? input_vectors.size() - 1 : start_index
277 if (end_index - start_index < num_group_size) {
278 memset(ptr_int_feat, 0, num_elements * num_group_size * sizeof(int16_t)); // for zero padding partial group
280 for (uint32_t j = start_index; j < end_index; j++) {
281 for (uint32_t i = 0; i < num_elements; i++) {
282 float *ptr_float_feat = input_vectors[j].data();
283 float rounding_value = (ptr_float_feat[i] > 0) ? 0.5f : -0.5f;
284 float value = ptr_float_feat[i] * scale_factor + rounding_value;
285 if (value > 32767.0) {
286 ptr_int_feat[i * num_group_size + j - start_index] = 32767;
288 } else if (value < -32768.0) {
289 ptr_int_feat[i * num_group_size + j - start_index] = -32768;
292 ptr_int_feat[i * num_group_size + j - start_index] = (int16_t) value;
296 if (num_saturate > 0) {
297 QUANTWARNING("Warning: %d / %d saturations during QuantizeVector16()\n",
299 num_elements * num_group_size);
303 void ReQuantizeVector16(int16_t *ptr_int_memory, uint32_t num_elements, float prev_scale_factor, float scale_factor) {
304 uint32_t num_saturate = 0;
306 int16_t *ptr_int_feat = reinterpret_cast<int16_t *> (ptr_int_memory);
307 for (uint32_t i = 0; i < num_elements; i++) {
308 float float_value = ptr_int_feat[i] / prev_scale_factor;
309 float rounding_value = (float_value > 0) ? 0.5f : -0.5f;
310 float value = float_value * scale_factor + rounding_value;
311 if (value > 32767.0) {
312 ptr_int_feat[i] = 32767;
314 } else if (value < -32768.0) {
315 ptr_int_feat[i] = -32768;
318 ptr_int_feat[i] = (int16_t) value;
322 if (num_saturate > 0) {
323 QUANTWARNING("Warning: %d / %d saturations during ReQuantizeVector16()\n", num_saturate, num_elements);
327 void QuantizeBias16(float *ptr_float_biases,
328 int32_t *ptr_int_biases,
329 float input_scale_factor,
330 float weight_scale_factor,
331 float *ptr_output_scale_factor,
333 uint32_t num_saturate = 0;
335 *ptr_output_scale_factor = input_scale_factor * weight_scale_factor;
336 for (uint32_t j = 0; j < num_rows; j++) {
337 float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
338 float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value;
339 if (value > 2147483647.0) {
340 ptr_int_biases[j] = 2147483647L;
342 } else if (value < -2147483648.0) {
343 ptr_int_biases[j] = -2147483648LL;
346 ptr_int_biases[j] = (int32_t) value;
350 if (num_saturate > 0) {
351 QUANTWARNING("Warning: %d / %d saturations in QuantizeBias16()\n", num_saturate, num_rows);
// De-quantizes int16 values into float_vector by dividing by scale_factor.
// Reads float_vector.size() elements from ptr_int_memory; the caller must
// size float_vector accordingly.
void DeQuantizeVector16(int16_t *ptr_int_memory, std::vector<float> &float_vector, float scale_factor) {
    int16_t *int16_vector = reinterpret_cast<int16_t *> (ptr_int_memory);
    for (uint32_t i = 0; i < float_vector.size(); i++) {
        float_vector[i] = int16_vector[i] / scale_factor;
    }
}
// De-quantizes int32 values into float_vector by dividing by scale_factor.
// Reads float_vector.size() elements from ptr_int_memory; the caller must
// size float_vector accordingly.
void DeQuantizeVector32(int32_t *ptr_int_memory, std::vector<float> &float_vector, float scale_factor) {
    int32_t *int32_vector = reinterpret_cast<int32_t *> (ptr_int_memory);
    for (uint32_t i = 0; i < float_vector.size(); i++) {
        float_vector[i] = int32_vector[i] / scale_factor;
    }
}
// De-quantizes one member of a group-interleaved int32 buffer: element i of
// group member `index` is read from ptr_int_memory[i * num_group_size + index]
// and divided by scale_factor into float_vector[i].
void DeQuantizeVector32(int32_t *ptr_int_memory,
                        std::vector<float> &float_vector,
                        uint32_t index,
                        uint32_t num_group_size,
                        float scale_factor) {
    int32_t *int32_vector = reinterpret_cast<int32_t *> (ptr_int_memory);
    for (uint32_t i = 0; i < float_vector.size(); i++) {
        float_vector[i] = int32_vector[i * num_group_size + index] / scale_factor;
    }
}
379 bool IntegrityCheckAffine16(float *ptr_float_weights,
380 float *ptr_float_biases,
381 int16_t *ptr_int_weights,
382 int32_t *ptr_int_biases,
383 float weight_scale_factor,
384 float output_scale_factor,
386 uint32_t num_columns,
387 uint32_t num_rows_padded,
388 uint32_t num_columns_padded) {
389 bool model_ok = true;
391 for (uint32_t row = 0; row < num_rows; row++) {
392 for (uint32_t col = 0; col < num_columns; col++) {
393 float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f;
394 float value = ptr_float_weights[row * num_columns + col] * weight_scale_factor + rounding_value;
395 int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
397 if (value > 32767.0) {
399 } else if (value < -32768.0) {
402 int_value = (int16_t) value;
404 if (int_value != *ptr_weight_16) {
408 for (uint32_t col = num_columns; col < num_columns_padded; col++) {
409 int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
410 if (*ptr_weight_16 != 0) {
415 for (uint32_t row = num_rows; row < num_rows_padded; row++) {
416 for (uint32_t col = 0; col < num_columns_padded; col++) {
417 int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
418 if (*ptr_weight_16 != 0) {
424 for (uint32_t j = 0; j < num_rows; j++) {
425 float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
426 float value = ptr_float_biases[j] * output_scale_factor + rounding_value;
428 if (value > 2147483647.0) {
429 int_value = 2147483647L;
430 } else if (value < -2147483648.0) {
431 int_value = -2147483648LL;
433 int_value = (int32_t) value;
435 if (int_value != ptr_int_biases[j]) {
439 for (uint32_t j = num_rows; j < num_rows_padded; j++) {
440 if (ptr_int_biases[j] != 0) {
448 bool IntegrityCheckAffineWeights16(float *ptr_float_weights,
449 int16_t *ptr_int_weights,
450 float weight_scale_factor,
452 uint32_t num_columns,
453 uint32_t num_rows_padded,
454 uint32_t num_columns_padded) {
455 bool model_ok = true;
457 for (uint32_t row = 0; row < num_rows; row++) {
458 for (uint32_t col = 0; col < num_columns; col++) {
459 float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f;
460 float value = ptr_float_weights[row * num_columns + col] * weight_scale_factor + rounding_value;
461 int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
463 if (value > 32767.0) {
465 } else if (value < -32768.0) {
468 int_value = (int16_t) value;
470 if (int_value != *ptr_weight_16) {
474 for (uint32_t col = num_columns; col < num_columns_padded; col++) {
475 int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
476 if (*ptr_weight_16 != 0) {
481 for (uint32_t row = num_rows; row < num_rows_padded; row++) {
482 for (uint32_t col = 0; col < num_columns_padded; col++) {
483 int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
484 if (*ptr_weight_16 != 0) {
494 void QuantizeAffine8(float *ptr_float_weights, float *ptr_float_biases,
495 int8_t *ptr_int_weights, intel_compound_bias_t *ptr_int_biases,
496 float input_scale_factor, float *ptr_weight_scale_factor,
497 float *ptr_output_scale_factor, uint32_t num_rows, uint32_t num_columns,
498 uint32_t num_rows_padded, uint32_t num_columns_padded) {
499 uint32_t num_saturate = 0;
501 if (*ptr_weight_scale_factor == 1.0) {
502 // scale factor for weights is not calculated yet
503 float mean_weight = 0.0;
504 float mean_weight_squared = 0.0;
505 float max_weight = -1e20f;
507 float mean_plus_2stdev;
509 for (uint32_t i = 0; i < num_rows; i++) {
510 for (uint32_t j = 0; j < num_columns; j++) {
511 float weight = ptr_float_weights[i*num_columns + j];
512 mean_weight += weight;
513 mean_weight_squared += weight * weight;
514 if (fabs(weight) > max_weight) {
515 max_weight = fabs(weight);
520 mean_weight /= static_cast<float>(num_rows * num_columns);
521 mean_weight_squared /= static_cast<float>(num_rows * num_columns);
522 var_weight = mean_weight_squared - mean_weight * mean_weight;
523 mean_plus_2stdev = mean_weight + 2.0f * static_cast<float>(sqrtf(var_weight));
525 *ptr_weight_scale_factor = static_cast<float>(MAX_VAL_1B_WEIGHT) / max_weight;
527 // For 8 bit weights quantize as follows:
528 // 1. adjust scale factor to increase dynamic range of entire matrix by max multiplier
529 // 2. find maximum scaled weight for each row
530 // 3. find multiplier such that dividing by the multiplier brings row back within 8-bit dynamic range
531 // 4. quantize and store scaled row
532 *ptr_weight_scale_factor = MAX_OUT_MULTIPLIER * *ptr_weight_scale_factor; // increase dynamic range by max multiplier
533 *ptr_output_scale_factor = input_scale_factor * *ptr_weight_scale_factor;
535 float valueAcc = 0.0;
536 for (uint32_t row = 0; row < num_rows; row++) {
537 float scaled_row_max = 0;
538 float rounding_value, value;
539 for (uint32_t col = 0; col < num_columns; col++) {
540 value = ptr_float_weights[row*num_columns + col] * *ptr_weight_scale_factor;
542 if (fabs(value) > scaled_row_max) {
543 scaled_row_max = fabs(value);
547 value = scaled_row_max / static_cast<float>(MAX_VAL_1B_WEIGHT);
548 ptr_int_biases[row].multiplier = (uint8_t) (value + 0.5);
549 for (uint32_t col = 0; col < num_columns; col++) {
550 int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col);
551 rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f;
554 value = ptr_float_weights[row*num_columns + col] * (*ptr_weight_scale_factor / ptr_int_biases[row].multiplier) + rounding_value;
558 } else if (value < -128.0) {
559 *ptr_weight_8 = -128;
562 *ptr_weight_8 = (int8_t)value;
565 for (uint32_t col = num_columns; col < num_columns_padded; col++) {
566 int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col);
570 for (uint32_t row = num_rows; row < num_rows_padded; row++) {
571 for (uint32_t col = 0; col < num_columns_padded; col++) {
572 int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col);
575 ptr_int_biases[row].multiplier = 0;
578 // bias value of the bas will be only used when input bias provided
579 if (ptr_float_biases != nullptr) {
580 for (uint32_t j = 0; j < num_rows; j++) {
581 float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
582 float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value;
583 if (value > 2147483647.0) {
584 ptr_int_biases[j].bias = 2147483647L;
586 } else if (value < -2147483648.0) {
587 ptr_int_biases[j].bias = -2147483648LL;
590 ptr_int_biases[j].bias = (int32_t) value;
595 if (num_saturate > 0) {
596 QUANTWARNING("Warning: %d / %d saturations in QuantizeAffine8()\n", num_saturate, num_rows * num_columns + num_rows);
601 void QuantizeBias8(float *ptr_float_biases,
602 intel_compound_bias_t *ptr_int_biases,
603 float input_scale_factor,
604 float weight_scale_factor,
605 float *ptr_output_scale_factor, uint32_t num_rows) {
606 uint32_t num_saturate = 0;
608 *ptr_output_scale_factor = input_scale_factor * weight_scale_factor;
609 for (uint32_t j = 0; j < num_rows; j++) {
610 float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
611 float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value;
612 if (value > 2147483647.0) {
613 ptr_int_biases[j].bias = 2147483647L;
615 } else if (value < -2147483648.0) {
616 ptr_int_biases[j].bias = -2147483648LL;
619 ptr_int_biases[j].bias = (int32_t)value;
623 if (num_saturate > 0) {
624 QUANTWARNING("Warning: %d / %d saturations in QuantizeBias8()\n", num_saturate, num_rows);
628 bool IntegrityCheckAffine8(float *ptr_float_weights, float *ptr_float_biases, int8_t *ptr_int_weights, intel_compound_bias_t *ptr_int_biases,
629 float weight_scale_factor, float output_scale_factor, uint32_t num_rows, uint32_t num_columns,
630 uint32_t num_rows_padded, uint32_t num_columns_padded) {
631 bool model_ok = true;
633 for (uint32_t row = 0; row < num_rows; row++) {
634 float scaled_row_max = 0;
635 float rounding_value, value;
636 for (uint32_t col = 0; col < num_columns; col++) {
637 value = ptr_float_weights[row*num_columns + col] * weight_scale_factor;
638 if (fabs(value) > scaled_row_max) {
639 scaled_row_max = fabs(value);
642 value = scaled_row_max / static_cast<float>(MAX_VAL_1B_WEIGHT);
643 if (ptr_int_biases[row].multiplier != (uint8_t)(value + 0.5)) {
646 for (uint32_t col = 0; col < num_columns; col++) {
647 int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col);
649 rounding_value = (ptr_float_weights[row*num_columns + col] > 0) ? 0.5f : -0.5f;
650 value = ptr_float_weights[row*num_columns + col] * (weight_scale_factor / ptr_int_biases[row].multiplier) + rounding_value;
653 } else if (value < -128.0) {
656 int_value = (int8_t)value;
658 if (int_value != *ptr_weight_8) {
662 for (uint32_t col = num_columns; col < num_columns_padded; col++) {
663 int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col);
664 if (*ptr_weight_8 != 0) {
669 for (uint32_t row = num_rows; row < num_rows_padded; row++) {
670 for (uint32_t col = 0; col < num_columns_padded; col++) {
671 int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col);
672 if (*ptr_weight_8 != 0) {
676 if (ptr_int_biases[row].multiplier != 0) {
681 for (uint32_t j = 0; j < num_rows; j++) {
682 float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
683 float value = ptr_float_biases[j] * output_scale_factor + rounding_value;
685 if (value > 2147483647.0) {
686 int_value = 2147483647L;
687 } else if (value < -2147483648.0) {
688 int_value = -2147483648LL;
690 int_value = (int32_t)value;
692 if (int_value != ptr_int_biases[j].bias) {