inference-engine/src/gna_plugin/quantization/quantization.cpp

   1 // Copyright (C) 2018-2019 Intel Corporation
   2 // SPDX-License-Identifier: Apache-2.0
   3 //
   4
#include <algorithm>
#include <cmath>
#include <cstring>
#include <iostream>
#include "quantization.h"
   8
   9 void QuantizeAffine16(float *ptr_float_weights,
  10                       float *ptr_float_biases,
  11                       int16_t *ptr_int_weights,
  12                       int32_t *ptr_int_biases,
  13                       float input_scale_factor,
  14                       float *ptr_weight_scale_factor,
  15                       float *ptr_output_scale_factor,
  16                       uint32_t num_rows,
  17                       uint32_t num_columns,
  18                       uint32_t num_rows_padded,
  19                       uint32_t num_columns_padded) {
  20     uint32_t num_saturate = 0;
  21
  22     if (*ptr_weight_scale_factor == 1.0) {
  23         // scale factor for weights is not calculated yet
  24         float mean_weight = 0.0;
  25         float mean_weight_squared = 0.0;
  26         float max_weight = -1e20f;
  27         float var_weight;
  28         float mean_plus_2stdev;
  29
  30         for (uint32_t i = 0; i < num_rows; i++) {
  31             for (uint32_t j = 0; j < num_columns; j++) {
  32                 float weight = ptr_float_weights[i * num_columns + j];
  33                 mean_weight += weight;
  34                 mean_weight_squared += weight * weight;
  35                 if (fabs(weight) > max_weight) {
  36                     max_weight = fabs(weight);
  37                 }
  38             }
  39         }
  40
  41         mean_weight /= static_cast<float>(num_rows * num_columns);
  42         mean_weight_squared /= static_cast<float>(num_rows * num_columns);
  43         var_weight = mean_weight_squared - mean_weight * mean_weight;
  44         mean_plus_2stdev = mean_weight + 2.0f * static_cast<float>(sqrtf(var_weight));
  45
  46         *ptr_weight_scale_factor = static_cast<float>(MAX_VAL_2B_WEIGHT) / max_weight;
  47         *ptr_output_scale_factor = input_scale_factor * *ptr_weight_scale_factor;
  48     }
  49
  50     for (uint32_t row = 0; row < num_rows; row++) {
  51         for (uint32_t col = 0; col < num_columns; col++) {
  52             float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f;
  53             float value = ptr_float_weights[row * num_columns + col] * *ptr_weight_scale_factor + rounding_value;
  54             int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
  55             if (value > 32767.0) {
  56                 *ptr_weight_16 = 32767;
  57                 num_saturate++;
  58             } else if (value < -32768.0) {
  59                 *ptr_weight_16 = -32768;
  60                 num_saturate++;
  61             } else {
  62                 *ptr_weight_16 = (int16_t) value;
  63             }
  64         }
  65         for (uint32_t col = num_columns; col < num_columns_padded; col++) {
  66             int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
  67             *ptr_weight_16 = 0;
  68         }
  69     }
  70     for (uint32_t row = num_rows; row < num_rows_padded; row++) {
  71         for (uint32_t col = 0; col < num_columns_padded; col++) {
  72             int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
  73             *ptr_weight_16 = 0;
  74         }
  75     }
  76
  77     // case for element wise layer
  78     if (ptr_float_biases != nullptr && ptr_int_biases != nullptr) {
  79         for (uint32_t j = 0; j < num_rows; j++) {
  80             float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
  81             float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value;
  82             if (value > 2147483647.0) {
  83                 ptr_int_biases[j] = 2147483647L;
  84                 num_saturate++;
  85             } else if (value < -2147483648.0) {
  86                 ptr_int_biases[j] = -2147483648LL;
  87                 num_saturate++;
  88             } else {
  89                 ptr_int_biases[j] = (int32_t) value;
  90             }
  91         }
  92         for (uint32_t j = num_rows; j < num_rows_padded; j++) {
  93             ptr_int_biases[j] = 0;
  94         }
  95     }
  96
  97     if (num_saturate > 0) {
  98         QUANTWARNING("Warning:  %d / %d saturations in QuantizeAffine16()\n",
  99                      num_saturate,
 100                      num_rows * num_columns + num_rows);
 101     }
 102 }
 103
 104 void FixedQuantizeAffine16(float *ptr_float_weights,
 105                            float *ptr_float_biases,
 106                            int16_t *ptr_int_weights,
 107                            int32_t *ptr_int_biases,
 108                            float input_scale_factor,
 109                            float weight_scale_factor,
 110                            float *ptr_output_scale_factor,
 111                            uint32_t num_rows,
 112                            uint32_t num_columns,
 113                            uint32_t num_rows_padded,
 114                            uint32_t num_columns_padded) {
 115     uint32_t num_saturate = 0;
 116
 117     for (uint32_t row = 0; row < num_rows; row++) {
 118         for (uint32_t col = 0; col < num_columns; col++) {
 119             float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f;
 120             float value = ptr_float_weights[row * num_columns + col] * weight_scale_factor + rounding_value;
 121             int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
 122             if (value > 32767.0) {
 123                 *ptr_weight_16 = 32767;
 124                 num_saturate++;
 125             } else if (value < -32768.0) {
 126                 *ptr_weight_16 = -32768;
 127                 num_saturate++;
 128             } else {
 129                 *ptr_weight_16 = (int16_t) value;
 130             }
 131         }
 132     }
 133     for (uint32_t row = num_rows; row < num_rows_padded; row++) {
 134         for (uint32_t col = 0; col < num_columns_padded; col++) {
 135             int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
 136             *ptr_weight_16 = 0;
 137         }
 138     }
 139
 140     *ptr_output_scale_factor = input_scale_factor * weight_scale_factor;
 141
 142     for (uint32_t j = 0; j < num_rows; j++) {
 143         float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
 144         float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value;
 145         if (value > 2147483647.0) {
 146             ptr_int_biases[j] = 2147483647L;
 147             num_saturate++;
 148         } else if (value < -2147483648.0) {
 149             ptr_int_biases[j] = -2147483648LL;
 150             num_saturate++;
 151         } else {
 152             ptr_int_biases[j] = (int32_t) value;
 153         }
 154     }
 155     for (uint32_t j = num_rows; j < num_rows_padded; j++) {
 156         ptr_int_biases[j] = 0;
 157     }
 158
 159     if (num_saturate > 0) {
 160         QUANTWARNING("Warning:  %d / %d saturations in FixedQuantizeAffine16()\n",
 161                      num_saturate,
 162                      num_rows * num_columns + num_rows);
 163     }
 164 }
 165
 166 float ScaleFactorForQuantization(void *ptr_float_memory, float target_max, size_t num_elements) {
 167     float *ptr_float_feat = reinterpret_cast<float *>(ptr_float_memory);
 168     float max = 0.0;
 169     float scale_factor;
 170
 171     for (size_t i = 0; i < num_elements; i++) {
 172         if (fabs(ptr_float_feat[i]) > max) {
 173             max = fabs(ptr_float_feat[i]);
 174         }
 175     }
 176
 177     if (max == 0) {
 178         scale_factor = 1.0;
 179     } else {
 180         scale_factor = target_max / max;
 181     }
 182
 183     return (scale_factor);
 184 }
 185
 186 float ScaleFactorForQuantization(std::vector<std::vector<float>> &input_vectors, float target_max) {
 187     float max = 0.0;
 188     float scale_factor;
 189     uint32_t num_vectors = (uint32_t) input_vectors.size();
 190
 191     for (uint32_t i = 0; i < num_vectors; i++) {
 192         float *ptr_float_feat = input_vectors[i].data();
 193         uint32_t num_elements = (uint32_t) input_vectors[i].size();
 194         for (uint32_t j = 0; i < num_elements; i++) {
 195             if (fabs(ptr_float_feat[j]) > max) {
 196                 max = fabs(ptr_float_feat[j]);
 197             }
 198         }
 199     }
 200
 201     if (max == 0) {
 202         scale_factor = 1.0;
 203     } else {
 204         scale_factor = target_max / max;
 205     }
 206
 207     return (scale_factor);
 208 }
 209
 210 float ScaleFactorForQuantization(std::vector<std::vector<float>> &input_vectors,
 211                                  int index,
 212                                  int num_group_size,
 213                                  float target_max) {
 214     float max = 0.0;
 215     float scale_factor;
 216     uint32_t start_index = (uint32_t) index;
 217     uint32_t end_index =
 218         (uint32_t) ((index + num_group_size > input_vectors.size()) ? input_vectors.size() - 1 : start_index
 219             + num_group_size);
 220
 221     for (uint32_t i = start_index; i < end_index; i++) {
 222         float *ptr_float_feat = input_vectors[i].data();
 223         uint32_t num_elements = (uint32_t) input_vectors[i].size();
 224         for (uint32_t j = 0; j < num_elements; j++) {
 225             if (fabs(ptr_float_feat[j]) > max) {
 226                 max = fabs(ptr_float_feat[j]);
 227             }
 228         }
 229     }
 230
 231     if (max == 0) {
 232         scale_factor = 1.0;
 233     } else {
 234         scale_factor = target_max / max;
 235     }
 236
 237     return (scale_factor);
 238 }
 239
 240 void QuantizeVector16(float *ptr_float_memory, int16_t *ptr_int_memory, uint32_t num_elements, float scale_factor) {
 241     float *ptr_float_feat = reinterpret_cast<float *>(ptr_float_memory);
 242     uint32_t num_saturate = 0;
 243
 244     int16_t *ptr_int_feat = reinterpret_cast<int16_t *>(ptr_int_memory);
 245     for (uint32_t i = 0; i < num_elements; i++) {
 246         float rounding_value = (ptr_float_feat[i] > 0) ? 0.5f : -0.5f;
 247         float value = ptr_float_feat[i] * scale_factor + rounding_value;
 248         if (value > 32767.0) {
 249             ptr_int_feat[i] = 32767;
 250             num_saturate++;
 251         } else if (value < -32768.0) {
 252             ptr_int_feat[i] = -32768;
 253             num_saturate++;
 254         } else {
 255             ptr_int_feat[i] = (int16_t) value;
 256         }
 257     }
 258
 259     if (num_saturate > 0) {
 260         QUANTWARNING("Warning:  %d / %d saturations during QuantizeVector16()\n", num_saturate, num_elements);
 261     }
 262 }
 263
 264 void QuantizeVector16(std::vector<std::vector<float>> &input_vectors,
 265                       int16_t *ptr_int_memory,
 266                       uint32_t index,
 267                       uint32_t num_group_size,
 268                       float scale_factor) {
 269     int16_t *ptr_int_feat = reinterpret_cast<int16_t *> (ptr_int_memory);
 270     uint32_t num_saturate = 0;
 271     uint32_t num_elements = (uint32_t) input_vectors[0].size();  // assume all vector are same size
 272     uint32_t start_index = (uint32_t) index;
 273     uint32_t end_index =
 274         (uint32_t) ((index + num_group_size > input_vectors.size()) ? input_vectors.size() - 1 : start_index
 275             + num_group_size);
 276
 277     if (end_index - start_index < num_group_size) {
 278         memset(ptr_int_feat, 0, num_elements * num_group_size * sizeof(int16_t));  // for zero padding partial group
 279     }
 280     for (uint32_t j = start_index; j < end_index; j++) {
 281         for (uint32_t i = 0; i < num_elements; i++) {
 282             float *ptr_float_feat = input_vectors[j].data();
 283             float rounding_value = (ptr_float_feat[i] > 0) ? 0.5f : -0.5f;
 284             float value = ptr_float_feat[i] * scale_factor + rounding_value;
 285             if (value > 32767.0) {
 286                 ptr_int_feat[i * num_group_size + j - start_index] = 32767;
 287                 num_saturate++;
 288             } else if (value < -32768.0) {
 289                 ptr_int_feat[i * num_group_size + j - start_index] = -32768;
 290                 num_saturate++;
 291             } else {
 292                 ptr_int_feat[i * num_group_size + j - start_index] = (int16_t) value;
 293             }
 294         }
 295     }
 296     if (num_saturate > 0) {
 297         QUANTWARNING("Warning:  %d / %d saturations during QuantizeVector16()\n",
 298                      num_saturate,
 299                      num_elements * num_group_size);
 300     }
 301 }
 302
 303 void ReQuantizeVector16(int16_t *ptr_int_memory, uint32_t num_elements, float prev_scale_factor, float scale_factor) {
 304     uint32_t num_saturate = 0;
 305
 306     int16_t *ptr_int_feat = reinterpret_cast<int16_t *> (ptr_int_memory);
 307     for (uint32_t i = 0; i < num_elements; i++) {
 308         float float_value = ptr_int_feat[i] / prev_scale_factor;
 309         float rounding_value = (float_value > 0) ? 0.5f : -0.5f;
 310         float value = float_value * scale_factor + rounding_value;
 311         if (value > 32767.0) {
 312             ptr_int_feat[i] = 32767;
 313             num_saturate++;
 314         } else if (value < -32768.0) {
 315             ptr_int_feat[i] = -32768;
 316             num_saturate++;
 317         } else {
 318             ptr_int_feat[i] = (int16_t) value;
 319         }
 320     }
 321
 322     if (num_saturate > 0) {
 323         QUANTWARNING("Warning:  %d / %d saturations during ReQuantizeVector16()\n", num_saturate, num_elements);
 324     }
 325 }
 326
 327 void QuantizeBias16(float *ptr_float_biases,
 328                     int32_t *ptr_int_biases,
 329                     float input_scale_factor,
 330                     float weight_scale_factor,
 331                     float *ptr_output_scale_factor,
 332                     uint32_t num_rows) {
 333     uint32_t num_saturate = 0;
 334
 335     *ptr_output_scale_factor = input_scale_factor * weight_scale_factor;
 336     for (uint32_t j = 0; j < num_rows; j++) {
 337         float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
 338         float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value;
 339         if (value > 2147483647.0) {
 340             ptr_int_biases[j] = 2147483647L;
 341             num_saturate++;
 342         } else if (value < -2147483648.0) {
 343             ptr_int_biases[j] = -2147483648LL;
 344             num_saturate++;
 345         } else {
 346             ptr_int_biases[j] = (int32_t) value;
 347         }
 348     }
 349
 350     if (num_saturate > 0) {
 351         QUANTWARNING("Warning:  %d / %d saturations in QuantizeBias16()\n", num_saturate, num_rows);
 352     }
 353 }
 354
 355 void DeQuantizeVector16(int16_t *ptr_int_memory, std::vector<float> &float_vector, float scale_factor) {
 356     int16_t *int16_vector = reinterpret_cast<int16_t *> (ptr_int_memory);
 357     for (uint32_t i = 0; i < float_vector.size(); i++) {
 358         float_vector[i] = int16_vector[i] / scale_factor;
 359     }
 360 }
 361
 362 void DeQuantizeVector32(int32_t *ptr_int_memory, std::vector<float> &float_vector, float scale_factor) {
 363     int32_t *int32_vector = reinterpret_cast<int32_t  *> (ptr_int_memory);
 364     for (uint32_t i = 0; i < float_vector.size(); i++) {
 365         float_vector[i] = int32_vector[i] / scale_factor;
 366     }
 367 }
 368
 369 void DeQuantizeVector32(int32_t *ptr_int_memory,
 370                         std::vector<float> &float_vector,
 371                         uint32_t index,
 372                         uint32_t num_group_size,
 373                         float scale_factor) {
 374     int32_t *int32_vector = reinterpret_cast<int32_t  *> (ptr_int_memory);
 375     for (uint32_t i = 0; i < float_vector.size(); i++) {
 376         float_vector[i] = int32_vector[i * num_group_size + index] / scale_factor;
 377     }
 378 }
 379 bool IntegrityCheckAffine16(float *ptr_float_weights,
 380                             float *ptr_float_biases,
 381                             int16_t *ptr_int_weights,
 382                             int32_t *ptr_int_biases,
 383                             float weight_scale_factor,
 384                             float output_scale_factor,
 385                             uint32_t num_rows,
 386                             uint32_t num_columns,
 387                             uint32_t num_rows_padded,
 388                             uint32_t num_columns_padded) {
 389     bool model_ok = true;
 390
 391     for (uint32_t row = 0; row < num_rows; row++) {
 392         for (uint32_t col = 0; col < num_columns; col++) {
 393             float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f;
 394             float value = ptr_float_weights[row * num_columns + col] * weight_scale_factor + rounding_value;
 395             int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
 396             int16_t int_value;
 397             if (value > 32767.0) {
 398                 int_value = 32767;
 399             } else if (value < -32768.0) {
 400                 int_value = -32768;
 401             } else {
 402                 int_value = (int16_t) value;
 403             }
 404             if (int_value != *ptr_weight_16) {
 405                 model_ok = false;
 406             }
 407         }
 408         for (uint32_t col = num_columns; col < num_columns_padded; col++) {
 409             int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
 410             if (*ptr_weight_16 != 0) {
 411                 model_ok = false;
 412             }
 413         }
 414     }
 415     for (uint32_t row = num_rows; row < num_rows_padded; row++) {
 416         for (uint32_t col = 0; col < num_columns_padded; col++) {
 417             int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
 418             if (*ptr_weight_16 != 0) {
 419                 model_ok = false;
 420             }
 421         }
 422     }
 423
 424     for (uint32_t j = 0; j < num_rows; j++) {
 425         float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
 426         float value = ptr_float_biases[j] * output_scale_factor + rounding_value;
 427         int32_t int_value;
 428         if (value > 2147483647.0) {
 429             int_value = 2147483647L;
 430         } else if (value < -2147483648.0) {
 431             int_value = -2147483648LL;
 432         } else {
 433             int_value = (int32_t) value;
 434         }
 435         if (int_value != ptr_int_biases[j]) {
 436             model_ok = false;
 437         }
 438     }
 439     for (uint32_t j = num_rows; j < num_rows_padded; j++) {
 440         if (ptr_int_biases[j] != 0) {
 441             model_ok = false;
 442         }
 443     }
 444
 445     return (model_ok);
 446 }
 447
 448 bool IntegrityCheckAffineWeights16(float *ptr_float_weights,
 449                                    int16_t *ptr_int_weights,
 450                                    float weight_scale_factor,
 451                                    uint32_t num_rows,
 452                                    uint32_t num_columns,
 453                                    uint32_t num_rows_padded,
 454                                    uint32_t num_columns_padded) {
 455     bool model_ok = true;
 456
 457     for (uint32_t row = 0; row < num_rows; row++) {
 458         for (uint32_t col = 0; col < num_columns; col++) {
 459             float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f;
 460             float value = ptr_float_weights[row * num_columns + col] * weight_scale_factor + rounding_value;
 461             int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
 462             int16_t int_value;
 463             if (value > 32767.0) {
 464                 int_value = 32767;
 465             } else if (value < -32768.0) {
 466                 int_value = -32768;
 467             } else {
 468                 int_value = (int16_t) value;
 469             }
 470             if (int_value != *ptr_weight_16) {
 471                 model_ok = false;
 472             }
 473         }
 474         for (uint32_t col = num_columns; col < num_columns_padded; col++) {
 475             int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
 476             if (*ptr_weight_16 != 0) {
 477                 model_ok = false;
 478             }
 479         }
 480     }
 481     for (uint32_t row = num_rows; row < num_rows_padded; row++) {
 482         for (uint32_t col = 0; col < num_columns_padded; col++) {
 483             int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
 484             if (*ptr_weight_16 != 0) {
 485                 model_ok = false;
 486             }
 487         }
 488     }
 489
 490     return (model_ok);
 491 }
 492
 493
 494 void QuantizeAffine8(float *ptr_float_weights, float *ptr_float_biases,
 495                      int8_t *ptr_int_weights, intel_compound_bias_t *ptr_int_biases,
 496                      float input_scale_factor, float *ptr_weight_scale_factor,
 497                      float *ptr_output_scale_factor, uint32_t num_rows, uint32_t num_columns,
 498                      uint32_t num_rows_padded, uint32_t num_columns_padded) {
 499     uint32_t num_saturate = 0;
 500
 501     if (*ptr_weight_scale_factor == 1.0) {
 502         // scale factor for weights is not calculated yet
 503         float mean_weight = 0.0;
 504         float mean_weight_squared = 0.0;
 505         float max_weight = -1e20f;
 506         float var_weight;
 507         float mean_plus_2stdev;
 508
 509         for (uint32_t i = 0; i < num_rows; i++) {
 510             for (uint32_t j = 0; j < num_columns; j++) {
 511                 float weight = ptr_float_weights[i*num_columns + j];
 512                 mean_weight += weight;
 513                 mean_weight_squared += weight * weight;
 514                 if (fabs(weight) > max_weight) {
 515                     max_weight = fabs(weight);
 516                 }
 517             }
 518         }
 519
 520         mean_weight /= static_cast<float>(num_rows * num_columns);
 521         mean_weight_squared /= static_cast<float>(num_rows * num_columns);
 522         var_weight = mean_weight_squared - mean_weight * mean_weight;
 523         mean_plus_2stdev = mean_weight + 2.0f * static_cast<float>(sqrtf(var_weight));
 524
 525         *ptr_weight_scale_factor = static_cast<float>(MAX_VAL_1B_WEIGHT) / max_weight;
 526
 527         // For 8 bit weights quantize as follows:
 528         // 1. adjust scale factor to increase dynamic range of entire matrix by max multiplier
 529         // 2. find maximum scaled weight for each row
 530         // 3. find multiplier such that dividing by the multiplier brings row back within 8-bit dynamic range
 531         // 4. quantize and store scaled row
 532         *ptr_weight_scale_factor = MAX_OUT_MULTIPLIER * *ptr_weight_scale_factor;  //  increase dynamic range by max multiplier
 533         *ptr_output_scale_factor = input_scale_factor * *ptr_weight_scale_factor;
 534     }
 535     float valueAcc = 0.0;
 536     for (uint32_t row = 0; row < num_rows; row++) {
 537         float scaled_row_max = 0;
 538         float rounding_value, value;
 539         for (uint32_t col = 0; col < num_columns; col++) {
 540             value = ptr_float_weights[row*num_columns + col] * *ptr_weight_scale_factor;
 541             valueAcc += value;
 542             if (fabs(value) > scaled_row_max) {
 543                 scaled_row_max = fabs(value);
 544             }
 545         }
 546
 547         value = scaled_row_max / static_cast<float>(MAX_VAL_1B_WEIGHT);
 548         ptr_int_biases[row].multiplier = (uint8_t) (value + 0.5);
 549         for (uint32_t col = 0; col < num_columns; col++) {
 550             int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col);
 551             rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f;
 552
 553
 554             value = ptr_float_weights[row*num_columns + col] * (*ptr_weight_scale_factor / ptr_int_biases[row].multiplier) + rounding_value;
 555             if (value > 127.0) {
 556                 *ptr_weight_8 = 127;
 557                 num_saturate++;
 558             } else if (value < -128.0) {
 559                 *ptr_weight_8 = -128;
 560                 num_saturate++;
 561             } else {
 562                 *ptr_weight_8 = (int8_t)value;
 563             }
 564         }
 565         for (uint32_t col = num_columns; col < num_columns_padded; col++) {
 566             int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col);
 567             *ptr_weight_8 = 0;
 568         }
 569     }
 570     for (uint32_t row = num_rows; row < num_rows_padded; row++) {
 571         for (uint32_t col = 0; col < num_columns_padded; col++) {
 572             int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col);
 573             *ptr_weight_8 = 0;
 574         }
 575         ptr_int_biases[row].multiplier = 0;
 576     }
 577
 578     // bias value of the bas will be only used when input bias provided
 579     if (ptr_float_biases != nullptr) {
 580         for (uint32_t j = 0; j < num_rows; j++) {
 581             float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
 582             float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value;
 583             if (value > 2147483647.0) {
 584                 ptr_int_biases[j].bias = 2147483647L;
 585                 num_saturate++;
 586             } else if (value < -2147483648.0) {
 587                 ptr_int_biases[j].bias = -2147483648LL;
 588                 num_saturate++;
 589             } else {
 590                 ptr_int_biases[j].bias = (int32_t) value;
 591             }
 592         }
 593     }
 594
 595     if (num_saturate > 0) {
 596         QUANTWARNING("Warning:  %d / %d saturations in QuantizeAffine8()\n", num_saturate, num_rows * num_columns + num_rows);
 597     }
 598 }
 599
 600
 601 void QuantizeBias8(float *ptr_float_biases,
 602                    intel_compound_bias_t  *ptr_int_biases,
 603                    float input_scale_factor,
 604                    float weight_scale_factor,
 605                    float *ptr_output_scale_factor, uint32_t num_rows) {
 606     uint32_t num_saturate = 0;
 607
 608     *ptr_output_scale_factor = input_scale_factor * weight_scale_factor;
 609     for (uint32_t j = 0; j < num_rows; j++) {
 610         float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
 611         float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value;
 612         if (value > 2147483647.0) {
 613             ptr_int_biases[j].bias = 2147483647L;
 614             num_saturate++;
 615         } else if (value < -2147483648.0) {
 616             ptr_int_biases[j].bias = -2147483648LL;
 617             num_saturate++;
 618         } else {
 619             ptr_int_biases[j].bias = (int32_t)value;
 620         }
 621     }
 622
 623     if (num_saturate > 0) {
 624         QUANTWARNING("Warning:  %d / %d saturations in QuantizeBias8()\n", num_saturate, num_rows);
 625     }
 626 }
 627
 628 bool IntegrityCheckAffine8(float *ptr_float_weights, float *ptr_float_biases, int8_t *ptr_int_weights, intel_compound_bias_t *ptr_int_biases,
 629                            float weight_scale_factor, float output_scale_factor, uint32_t num_rows, uint32_t num_columns,
 630                            uint32_t num_rows_padded, uint32_t num_columns_padded) {
 631     bool model_ok = true;
 632
 633     for (uint32_t row = 0; row < num_rows; row++) {
 634         float scaled_row_max = 0;
 635         float rounding_value, value;
 636         for (uint32_t col = 0; col < num_columns; col++) {
 637             value = ptr_float_weights[row*num_columns + col] * weight_scale_factor;
 638             if (fabs(value) > scaled_row_max) {
 639                 scaled_row_max = fabs(value);
 640             }
 641         }
 642         value = scaled_row_max / static_cast<float>(MAX_VAL_1B_WEIGHT);
 643         if (ptr_int_biases[row].multiplier != (uint8_t)(value + 0.5)) {
 644             model_ok = false;
 645         }
 646         for (uint32_t col = 0; col < num_columns; col++) {
 647             int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col);
 648             int8_t int_value;
 649             rounding_value = (ptr_float_weights[row*num_columns + col] > 0) ? 0.5f : -0.5f;
 650             value = ptr_float_weights[row*num_columns + col] * (weight_scale_factor / ptr_int_biases[row].multiplier) + rounding_value;
 651             if (value > 127.0) {
 652                 int_value = 127;
 653             } else if (value < -128.0) {
 654                 int_value = -128;
 655             } else {
 656                 int_value = (int8_t)value;
 657             }
 658             if (int_value != *ptr_weight_8) {
 659                 model_ok = false;
 660             }
 661         }
 662         for (uint32_t col = num_columns; col < num_columns_padded; col++) {
 663             int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col);
 664             if (*ptr_weight_8 != 0) {
 665                 model_ok = false;
 666             }
 667         }
 668     }
 669     for (uint32_t row = num_rows; row < num_rows_padded; row++) {
 670         for (uint32_t col = 0; col < num_columns_padded; col++) {
 671             int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col);
 672             if (*ptr_weight_8 != 0) {
 673                 model_ok = false;
 674             }
 675         }
 676         if (ptr_int_biases[row].multiplier != 0) {
 677             model_ok = false;
 678         }
 679     }
 680
 681     for (uint32_t j = 0; j < num_rows; j++) {
 682         float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
 683         float value = ptr_float_biases[j] * output_scale_factor + rounding_value;
 684         int32_t int_value;
 685         if (value > 2147483647.0) {
 686             int_value = 2147483647L;
 687         } else if (value < -2147483648.0) {
 688             int_value = -2147483648LL;
 689         } else {
 690             int_value = (int32_t)value;
 691         }
 692         if (int_value != ptr_int_biases[j].bias) {
 693             model_ok = false;
 694         }
 695     }
 696
 697     return(model_ok);
 698 }
 699