compiler/luci/pass/src/QuantizeWeights.cpp

   1 /*
   2  * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *    http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include "QuantizeWeights.h"
  18 #include "QuantizationUtils.h"
  19
  20 #include <luci/Service/Nodes/CircleConst.h>
  21 #include <luci/Log.h>
  22
  23 #include <cmath>
  24 #include <vector>
  25 #include <functional>
  26 #include <limits>
  27
  28 using namespace luci;
  29
  30 namespace
  31 {
  32
  33 using IterFunc = std::function<void(uint32_t *, loco::TensorShape &, int32_t)>;
  34
  35 void iterate_per_channel(CircleConst *node, int32_t &channel_dim_index, IterFunc func)
  36 {
  37   loco::TensorShape dimension;
  38   dimension.rank(4);
  39   uint32_t indices[4] = {
  40     0,
  41   };
  42
  43   if (!get_channel_dim_index(node, dimension, channel_dim_index))
  44   {
  45     assert(false);
  46     return;
  47   }
  48
  49   for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++)
  50   {
  51     for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++)
  52     {
  53       for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++)
  54       {
  55         for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++)
  56         {
  57           func(indices, dimension, channel_dim_index);
  58         }
  59       }
  60     }
  61   }
  62 }
  63
  64 void asym_wquant_per_channel(CircleConst *node, std::vector<float> &min,
  65                              std::vector<float> &scaling_factor, int32_t &channel_dim_index)
  66 {
  67   assert(node->dtype() == loco::DataType::FLOAT32);
  68
  69   const int32_t kMinScale = 0;
  70   const int32_t kMaxScale = 255;
  71
  72   uint32_t size = node->size<loco::DataType::FLOAT32>();
  73   std::vector<int32_t> quantized_values(size);
  74
  75   auto quantize = [&](uint32_t *indices, loco::TensorShape &dimension, int32_t channel_dim_index) {
  76     int channel_idx = indices[channel_dim_index];
  77     const float scaling_factor_inv = 1.0 / scaling_factor[channel_idx];
  78     auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
  79     quantized_values[cal_offset(dimension, indices)] =
  80       static_cast<int32_t>(std::round((data - min[channel_idx]) * scaling_factor_inv));
  81   };
  82
  83   iterate_per_channel(node, channel_dim_index, quantize);
  84
  85   node->dtype(loco::DataType::U8);      // change the type of tensor
  86   node->size<loco::DataType::U8>(size); // resize tensor
  87   for (uint32_t i = 0; i < size; ++i)
  88   {
  89     node->at<loco::DataType::U8>(i) = std::min(kMaxScale, std::max(kMinScale, quantized_values[i]));
  90   }
  91 }
  92
  93 // TODO Reduce duplicate code with QuantizeDequantizeWeights
  94 void sym_wquant_per_channel(CircleConst *node, std::vector<float> &min, std::vector<float> &max,
  95                             std::vector<float> &scaling_factor, std::vector<float> &nudged_min,
  96                             std::vector<float> &nudged_max, int32_t &channel_dim_index)
  97 {
  98   assert(node->dtype() == loco::DataType::FLOAT32);
  99   const int32_t kMaxScale = std::numeric_limits<int16_t>::max();
 100   const int32_t kMinScale = -kMaxScale;
 101
 102   uint32_t size = node->size<loco::DataType::FLOAT32>();
 103   std::vector<int32_t> quantized_values(size);
 104
 105   for (size_t i = 0; i < min.size(); ++i)
 106   {
 107     compute_sym_scale(min[i], max[i], scaling_factor[i], nudged_min[i], nudged_max[i]);
 108   }
 109
 110   auto quantize = [&](uint32_t *indices, loco::TensorShape &dimension, int channel_dim_index) {
 111     int channel_idx = indices[channel_dim_index];
 112     const float scaling_factor_inv = 1.0 / scaling_factor[channel_idx];
 113     auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
 114     data = data < nudged_min[channel_idx] ? nudged_min[channel_idx] : data;
 115     data = data > nudged_max[channel_idx] ? nudged_max[channel_idx] : data;
 116     quantized_values[cal_offset(dimension, indices)] =
 117       static_cast<int32_t>(std::round(data * scaling_factor_inv));
 118   };
 119
 120   iterate_per_channel(node, channel_dim_index, quantize);
 121
 122   node->dtype(loco::DataType::S16);      // change the type of tensor
 123   node->size<loco::DataType::S16>(size); // resize tensor
 124   for (uint32_t i = 0; i < size; ++i)
 125   {
 126     node->at<loco::DataType::S16>(i) =
 127       std::min(kMaxScale, std::max(kMinScale, quantized_values[i]));
 128   }
 129 }
 130
 131 void cal_minmax_per_channel(CircleConst *node, std::vector<float> &min, std::vector<float> &max,
 132                             int32_t &channel_dim_index)
 133 {
 134   loco::TensorShape dimension;
 135   dimension.rank(4);
 136
 137   if (!get_channel_dim_index(node, dimension, channel_dim_index))
 138   {
 139     throw std::runtime_error("Failed to find channel index in " + node->name());
 140   }
 141   auto size = dimension.dim(channel_dim_index).value();
 142
 143   std::vector<bool> has_min_max_value(size, false);
 144   min.resize(size);
 145   max.resize(size);
 146
 147   auto cal_minmax = [&](uint32_t *indices, loco::TensorShape &dimension, int channel_dim_index) {
 148     int channel_idx = indices[channel_dim_index];
 149     auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
 150     if (has_min_max_value[channel_idx])
 151     {
 152       min[channel_idx] = data < min[channel_idx] ? data : min[channel_idx];
 153       max[channel_idx] = data > max[channel_idx] ? data : max[channel_idx];
 154     }
 155     else
 156     {
 157       min[channel_idx] = data;
 158       max[channel_idx] = data;
 159       has_min_max_value[channel_idx] = true;
 160     }
 161   };
 162
 163   iterate_per_channel(node, channel_dim_index, cal_minmax);
 164 }
 165
 166 void asymmetric_wquant_per_channel(CircleConst *node, std::vector<float> &min,
 167                                    std::vector<float> &max, std::vector<float> &scaling_factor,
 168                                    std::vector<int64_t> &zp, std::vector<float> &nudged_min,
 169                                    std::vector<float> &nudged_max, int32_t &channel_dim_index)
 170 {
 171   assert(node->dtype() == loco::DataType::FLOAT32);
 172
 173   const int32_t kMinScale = 0;
 174   const int32_t kMaxScale = 255;
 175
 176   uint32_t size = node->size<loco::DataType::FLOAT32>();
 177   std::vector<int32_t> quantized_values(size);
 178
 179   for (size_t i = 0; i < min.size(); ++i)
 180   {
 181     compute_asym_scale_zp(min[i], max[i], scaling_factor[i], zp[i], nudged_min[i], nudged_max[i]);
 182   }
 183
 184   auto quantize = [&](uint32_t *indices, loco::TensorShape &dimension, int channel_dim_index) {
 185     int channel_idx = indices[channel_dim_index];
 186     const float scaling_factor_inv = 1.0 / scaling_factor[channel_idx];
 187     auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
 188     data = data < nudged_min[channel_idx] ? nudged_min[channel_idx] : data;
 189     data = data > nudged_max[channel_idx] ? nudged_max[channel_idx] : data;
 190     quantized_values[cal_offset(dimension, indices)] =
 191       static_cast<int32_t>(std::round((data - nudged_min[channel_idx]) * scaling_factor_inv));
 192   };
 193
 194   iterate_per_channel(node, channel_dim_index, quantize);
 195
 196   node->dtype(loco::DataType::U8);      // change the type of tensor
 197   node->size<loco::DataType::U8>(size); // resize tensor
 198   for (uint32_t i = 0; i < size; ++i)
 199   {
 200     node->at<loco::DataType::U8>(i) = std::min(kMaxScale, std::max(kMinScale, quantized_values[i]));
 201   }
 202 }
 203
 204 void sym_wquant_per_channel(CircleConst *node, std::vector<float> &scaling_factor,
 205                             int32_t &channel_dim_index)
 206 {
 207   assert(node->dtype() == loco::DataType::FLOAT32);
 208
 209   const int32_t kMaxScale = std::numeric_limits<int16_t>::max();
 210   const int32_t kMinScale = -kMaxScale;
 211
 212   uint32_t size = node->size<loco::DataType::FLOAT32>();
 213   std::vector<int32_t> quantized_values(size);
 214
 215   auto quantize = [&](uint32_t *indices, loco::TensorShape &dimension, int32_t channel_dim_index) {
 216     int channel_idx = indices[channel_dim_index];
 217     const float scaling_factor_inv = 1.0 / scaling_factor[channel_idx];
 218     auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
 219     quantized_values[cal_offset(dimension, indices)] =
 220       static_cast<int32_t>(std::round(data * scaling_factor_inv));
 221   };
 222
 223   iterate_per_channel(node, channel_dim_index, quantize);
 224
 225   node->dtype(loco::DataType::S16);      // change the type of tensor
 226   node->size<loco::DataType::S16>(size); // resize tensor
 227   for (uint32_t i = 0; i < size; ++i)
 228   {
 229     node->at<loco::DataType::S16>(i) =
 230       std::min(kMaxScale, std::max(kMinScale, quantized_values[i]));
 231   }
 232 }
 233
 234 void asym_wquant_per_layer(CircleConst *node, float min, float scaling_factor)
 235 {
 236   const int32_t kMinScale = 0;
 237   const int32_t kMaxScale = 255;
 238
 239   uint32_t size = node->size<loco::DataType::FLOAT32>();
 240
 241   const float scaling_factor_inv = 1.0 / scaling_factor;
 242   std::vector<int32_t> quantized_values(size);
 243   for (uint32_t i = 0; i < size; ++i)
 244   {
 245     auto data = node->at<loco::DataType::FLOAT32>(i);
 246     quantized_values[i] = static_cast<int32_t>(std::round((data - min) * scaling_factor_inv));
 247   }
 248
 249   node->dtype(loco::DataType::U8);      // change the type of tensor
 250   node->size<loco::DataType::U8>(size); // resize tensor
 251   for (uint32_t i = 0; i < size; ++i)
 252   {
 253     node->at<loco::DataType::U8>(i) = std::min(kMaxScale, std::max(kMinScale, quantized_values[i]));
 254   }
 255 }
 256
 257 // Quantize const per channel
 258 //
 259 // The last dimension of const is the same as the dimension of channel
 260 // And the rest of the const dimensions should be 1
 261 // So, a 'single value' is quantized per channel
 262 //
 263 // Quantization spec (f: fp value, q: quantized value)
 264 //
 265 // uint8
 266 //   Positive f: f = f * (q - 0) [q = 1, scale = f, zp = 0]
 267 //   Negative f: f = (-f) * (q - 1) [q = 0, scale = -f, zp = 1]
 268 //
 269 // int16
 270 //   Positive f: f = f * (q - 0) [q = 1, scale = f, zp = 0]
 271 //   Negative f: f = (-f) * (q - 0) [q = -1, scale = -f, zp = 0]
 272 void quant_const_per_channel(CircleConst *node, loco::DataType quant_type)
 273 {
 274   assert(node->dtype() == loco::DataType::FLOAT32);
 275   assert(node->rank() > 0);
 276
 277   for (uint32_t i = 0; i < node->rank() - 1; i++)
 278   {
 279     // Caller should call this function when the below condition is satisfied
 280     if (node->dim(i).value() != 1)
 281       throw std::runtime_error("Non-channel dimension of const node must be 1");
 282   }
 283
 284   uint32_t size = node->size<loco::DataType::FLOAT32>();
 285   assert(size == node->dim(node->rank() - 1).value());
 286
 287   auto quantparam = std::make_unique<CircleQuantParam>();
 288   quantparam->quantized_dimension = node->rank() - 1;
 289   std::vector<int32_t> quantized_data(size);
 290
 291   for (uint32_t i = 0; i < size; ++i)
 292   {
 293     auto data = node->at<loco::DataType::FLOAT32>(i);
 294     if (quant_type == loco::DataType::U8)
 295     {
 296       if (data >= 0)
 297       {
 298         quantparam->scale.push_back(data);
 299         quantparam->zerop.push_back(0);
 300         quantized_data[i] = 1;
 301       }
 302       else
 303       {
 304         quantparam->scale.push_back(-data);
 305         quantparam->zerop.push_back(1);
 306         quantized_data[i] = 0;
 307       }
 308     }
 309     else if (quant_type == loco::DataType::S16)
 310     {
 311       if (data >= 0)
 312       {
 313         quantparam->scale.push_back(data);
 314         quantized_data[i] = 1;
 315       }
 316       else
 317       {
 318         quantparam->scale.push_back(-data);
 319         quantized_data[i] = -1;
 320       }
 321       quantparam->zerop.push_back(0);
 322     }
 323   }
 324   node->quantparam(std::move(quantparam));
 325
 326   switch (quant_type)
 327   {
 328     case loco::DataType::U8:
 329       node->dtype(loco::DataType::U8);
 330       node->size<loco::DataType::U8>(size);
 331       for (uint32_t i = 0; i < size; ++i)
 332       {
 333         assert(quantized_data[i] == 0 || quantized_data[i] == 1);
 334         node->at<loco::DataType::U8>(i) = quantized_data[i];
 335       }
 336       break;
 337     case loco::DataType::S16:
 338       node->dtype(loco::DataType::S16);
 339       node->size<loco::DataType::S16>(size);
 340       for (uint32_t i = 0; i < size; ++i)
 341       {
 342         assert(quantized_data[i] == -1 || quantized_data[i] == 1);
 343         node->at<loco::DataType::S16>(i) = quantized_data[i];
 344       }
 345       break;
 346     default:
 347       throw std::runtime_error("Unsupported data type");
 348   }
 349 }
 350
 351 } // namespace
 352
 353 namespace luci
 354 {
 355
 356 void QuantizeWeights::quantize_weights(luci::CircleConst *weights)
 357 {
 358   // Find min/max per channel-wise
 359   if (granularity == QuantizationGranularity::ChannelWise)
 360   {
 361     auto quantparam = weights->quantparam();
 362     if (quantparam == nullptr)
 363     {
 364       // Find min/max on the fly
 365       // NOTE This is for the case when QuantizeDequantizeWeights is skipped
 366       // TODO Reduce duplicate codes
 367       std::vector<float> min;
 368       std::vector<float> max;
 369       int32_t channel_dim_index = 0;
 370
 371       cal_minmax_per_channel(weights, min, max, channel_dim_index);
 372
 373       std::vector<float> nudged_min(min.size());
 374       std::vector<float> nudged_max(min.size());
 375       std::vector<float> scaling_factor(min.size());
 376       std::vector<int64_t> zp(min.size());
 377
 378       if (output_type == loco::DataType::U8)
 379       {
 380         asymmetric_wquant_per_channel(weights, min, max, scaling_factor, zp, nudged_min, nudged_max,
 381                                       channel_dim_index);
 382       }
 383       else
 384       {
 385         sym_wquant_per_channel(weights, min, max, scaling_factor, nudged_min, nudged_max,
 386                                channel_dim_index);
 387       }
 388
 389       auto quantparam = std::make_unique<CircleQuantParam>();
 390       quantparam->scale = scaling_factor;
 391       quantparam->zerop = zp;
 392       quantparam->quantized_dimension = channel_dim_index;
 393       weights->quantparam(std::move(quantparam));
 394
 395       return;
 396     }
 397
 398     auto min = quantparam->min;
 399     auto scaling_factor = quantparam->scale;
 400     int32_t channel_dim_index = 0;
 401
 402     if (output_type == loco::DataType::U8)
 403     {
 404       asym_wquant_per_channel(weights, min, scaling_factor, channel_dim_index);
 405     }
 406     else
 407     {
 408       sym_wquant_per_channel(weights, scaling_factor, channel_dim_index);
 409     }
 410     quantparam->min.clear();
 411     quantparam->max.clear();
 412     quantparam->quantized_dimension = channel_dim_index;
 413   }
 414   // Find min/max per layer-wise
 415   else
 416   {
 417     auto quantparam = weights->quantparam();
 418     if (quantparam == nullptr)
 419     {
 420       // Find min/max on the fly
 421       // NOTE This is for the case when QuantizeDequantizeWeights is skipped
 422       // TODO Reduce duplicate codes
 423       float min = std::numeric_limits<float>::max();
 424       float max = std::numeric_limits<float>::lowest();
 425       for (uint32_t i = 0; i < weights->size<loco::DataType::FLOAT32>(); i++)
 426       {
 427         auto data = weights->at<loco::DataType::FLOAT32>(i);
 428         min = data < min ? data : min;
 429         max = data > max ? data : max;
 430       }
 431       float scaling_factor{0};
 432       int64_t zp{0};
 433       float nudged_min{0};
 434       float nudged_max{0};
 435
 436       asymmetric_wquant_with_minmax_per_layer(weights, min, max, scaling_factor, zp, nudged_min,
 437                                               nudged_max);
 438       auto quantparam = std::make_unique<CircleQuantParam>();
 439       quantparam->scale.push_back(scaling_factor);
 440       quantparam->zerop.push_back(zp);
 441       weights->quantparam(std::move(quantparam));
 442       return;
 443     }
 444
 445     // Quantize using recorded quantparam
 446     assert(quantparam != nullptr);
 447     assert(quantparam->min.size() == 1);   // only support layer-wise quant
 448     assert(quantparam->scale.size() == 1); // only support layer-wise quant
 449     auto min = quantparam->min[0];
 450     auto scaling_factor = quantparam->scale[0];
 451     asym_wquant_per_layer(weights, min, scaling_factor);
 452     quantparam->min.clear();
 453     quantparam->max.clear();
 454   }
 455 }
 456 void QuantizeWeights::visit(luci::CircleConv2D *node)
 457 {
 458   LOGGER(l);
 459   INFO(l) << "QuantizeWeights QuantizeWeights::visit node: " << node->name() << std::endl;
 460
 461   auto weights = loco::must_cast<luci::CircleConst *>(node->filter());
 462   if (!is_quantized(weights))
 463   {
 464     auto new_weights = luci::clone(weights);
 465     node->filter(new_weights);
 466     quantize_weights(new_weights);
 467   }
 468 }
 469
 470 void QuantizeWeights::visit(luci::CircleDepthwiseConv2D *node)
 471 {
 472   LOGGER(l);
 473   INFO(l) << "QuantizeWeights QuantizeWeights::visit node: " << node->name() << std::endl;
 474
 475   auto weights = loco::must_cast<luci::CircleConst *>(node->filter());
 476   if (!is_quantized(weights))
 477   {
 478     auto new_weights = luci::clone(weights);
 479     node->filter(new_weights);
 480     quantize_weights(new_weights);
 481   }
 482 }
 483
 484 void QuantizeWeights::visit(luci::CircleInstanceNorm *node)
 485 {
 486   LOGGER(l);
 487   INFO(l) << "QuantizeWeights QuantizeWeights::visit node: " << node->name() << std::endl;
 488
 489   auto gamma = loco::must_cast<luci::CircleConst *>(node->gamma());
 490   auto beta = loco::must_cast<luci::CircleConst *>(node->beta());
 491
 492   if (!is_quantized(gamma))
 493   {
 494     assert(gamma->dtype() == loco::DataType::FLOAT32);
 495     auto new_gamma = luci::clone(gamma);
 496     if (granularity == QuantizationGranularity::LayerWise)
 497       quant_const(new_gamma, output_type);
 498     else if (granularity == QuantizationGranularity::ChannelWise)
 499       quant_const_per_channel(new_gamma, output_type);
 500     node->gamma(new_gamma);
 501   }
 502   if (!is_quantized(beta))
 503   {
 504     assert(beta->dtype() == loco::DataType::FLOAT32);
 505     auto new_beta = luci::clone(beta);
 506     if (granularity == QuantizationGranularity::LayerWise)
 507       quant_const(new_beta, output_type);
 508     else if (granularity == QuantizationGranularity::ChannelWise)
 509       quant_const_per_channel(new_beta, output_type);
 510     node->beta(new_beta);
 511   }
 512 }
 513
 514 void QuantizeWeights::visit(luci::CirclePRelu *node)
 515 {
 516   LOGGER(l);
 517   INFO(l) << "QuantizeWeights QuantizeWeights::visit node: " << node->name() << std::endl;
 518
 519   auto alpha = loco::must_cast<luci::CircleConst *>(node->alpha());
 520
 521   if (!is_quantized(alpha))
 522   {
 523     assert(alpha->dtype() == loco::DataType::FLOAT32);
 524     auto new_alpha = luci::clone(alpha);
 525     if (granularity == QuantizationGranularity::LayerWise)
 526       quant_const(new_alpha, output_type);
 527     else if (granularity == QuantizationGranularity::ChannelWise)
 528       quant_const_per_channel(new_alpha, output_type);
 529     node->alpha(new_alpha);
 530   }
 531 }
 532
 533 void QuantizeWeights::visit(luci::CircleTransposeConv *node)
 534 {
 535   LOGGER(l);
 536   INFO(l) << "QuantizeWeights QuantizeWeights::visit node: " << node->name() << std::endl;
 537
 538   auto weights = loco::must_cast<luci::CircleConst *>(node->filter());
 539   if (!is_quantized(weights))
 540   {
 541     auto new_weights = luci::clone(weights);
 542     node->filter(new_weights);
 543     quantize_weights(new_weights);
 544   }
 545 }
 546
 547 void QuantizeWeights::visit(luci::CircleFullyConnected *node)
 548 {
 549   LOGGER(l);
 550   INFO(l) << "QuantizeWeights QuantizeWeights::visit node: " << node->name() << std::endl;
 551
 552   auto weights = loco::must_cast<luci::CircleConst *>(node->weights());
 553   if (!is_quantized(weights))
 554   {
 555     auto new_weights = luci::clone(weights);
 556     node->weights(new_weights);
 557     quantize_weights(new_weights);
 558   }
 559 }
 560
 561 void QuantizeWeights::visit(luci::CircleNode *) {}
 562
 563 } // namespace luci