/*
 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "luci/Pass/QuantizeDequantizeWeightsPass.h"

#include "QuantizationUtils.h"

#include <luci/IR/CircleNodes.h>
#include <luci/IR/CircleNodeVisitor.h>
#include <luci/Log.h>

#include <loco/IR/TensorShape.h>

#include <cassert>
#include <cmath>
#include <functional>
#include <iostream>
#include <limits>
#include <memory>
#include <vector>
34 void cal_minmax_per_channel(CircleConst *node, std::vector<float> &min, std::vector<float> &max)
36 loco::TensorShape dimension;
38 uint32_t indices[4] = {
41 int channel_dim_index{0};
44 if (!get_channel_dim_index(node, dimension, channel_dim_index))
49 size = dimension.dim(channel_dim_index).value();
51 std::vector<bool> has_min_max_value(size, false);
54 for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++)
56 for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++)
58 for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++)
60 for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++)
62 int channel_idx = indices[channel_dim_index];
63 auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
64 if (has_min_max_value[channel_idx])
66 min[channel_idx] = data < min[channel_idx] ? data : min[channel_idx];
67 max[channel_idx] = data > max[channel_idx] ? data : max[channel_idx];
71 min[channel_idx] = data;
72 max[channel_idx] = data;
73 has_min_max_value[channel_idx] = true;
81 void sym_wquant_per_channel(CircleConst *node, std::vector<float> &min, std::vector<float> &max,
82 std::vector<float> &scaling_factor, std::vector<int64_t> &zp,
83 std::vector<float> &nudged_min, std::vector<float> &nudged_max)
85 assert(node->dtype() == loco::DataType::FLOAT32);
86 const int32_t kMaxScale = std::numeric_limits<int16_t>::max();
87 const int32_t kMinScale = -kMaxScale;
89 uint32_t size = node->size<loco::DataType::FLOAT32>();
90 std::vector<int32_t> quantized_values(size);
92 for (size_t i = 0; i < min.size(); ++i)
94 compute_sym_scale_zp(min[i], max[i], scaling_factor[i], zp[i], nudged_min[i], nudged_max[i]);
97 loco::TensorShape dimension;
99 uint32_t indices[4] = {
102 int channel_dim_index{0};
104 if (!get_channel_dim_index(node, dimension, channel_dim_index))
110 for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++)
112 for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++)
114 for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++)
116 for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++)
118 int channel_idx = indices[channel_dim_index];
119 const float scaling_factor_inv = 1.0 / scaling_factor[channel_idx];
120 auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
121 data = data < nudged_min[channel_idx] ? nudged_min[channel_idx] : data;
122 data = data > nudged_max[channel_idx] ? nudged_max[channel_idx] : data;
123 quantized_values[cal_offset(dimension, indices)] =
124 static_cast<int32_t>(std::round(data * scaling_factor_inv));
130 node->dtype(loco::DataType::S16); // change the type of tensor
131 node->size<loco::DataType::S16>(size); // resize tensor
132 for (uint32_t i = 0; i < size; ++i)
134 node->at<loco::DataType::S16>(i) =
135 std::min(kMaxScale, std::max(kMinScale, quantized_values[i]));
139 void sym_wdequant_per_channel(CircleConst *node, std::vector<float> &scaling_factor)
141 assert(node->dtype() == loco::DataType::S16);
142 uint32_t size = node->size<loco::DataType::S16>();
143 std::vector<float> dequantized_values(size);
145 loco::TensorShape dimension;
147 uint32_t indices[4] = {
150 int channel_dim_index{0};
152 if (!get_channel_dim_index(node, dimension, channel_dim_index))
158 for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++)
160 for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++)
162 for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++)
164 for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++)
166 int channel_idx = indices[channel_dim_index];
167 auto data = node->at<loco::DataType::S16>(cal_offset(dimension, indices));
168 dequantized_values[cal_offset(dimension, indices)] =
169 static_cast<float>(data) * scaling_factor[channel_idx];
175 node->dtype(loco::DataType::FLOAT32); // change the type of tensor
176 node->size<loco::DataType::FLOAT32>(size); // resize tensor
177 for (uint32_t i = 0; i < size; ++i)
179 node->at<loco::DataType::FLOAT32>(i) = dequantized_values[i];
183 void asymmetric_wquant_per_channel(CircleConst *node, std::vector<float> &min,
184 std::vector<float> &max, std::vector<float> &scaling_factor,
185 std::vector<int64_t> &zp, std::vector<float> &nudged_min,
186 std::vector<float> &nudged_max)
188 assert(node->dtype() == loco::DataType::FLOAT32);
190 const int32_t kMinScale = 0;
191 const int32_t kMaxScale = 255;
193 uint32_t size = node->size<loco::DataType::FLOAT32>();
194 std::vector<int32_t> quantized_values(size);
196 for (size_t i = 0; i < min.size(); ++i)
198 compute_asym_scale_zp(min[i], max[i], scaling_factor[i], zp[i], nudged_min[i], nudged_max[i]);
201 loco::TensorShape dimension;
203 uint32_t indices[4] = {
206 int channel_dim_index{0};
208 if (!get_channel_dim_index(node, dimension, channel_dim_index))
214 for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++)
216 for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++)
218 for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++)
220 for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++)
222 int channel_idx = indices[channel_dim_index];
223 const float scaling_factor_inv = 1.0 / scaling_factor[channel_idx];
224 auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
225 data = data < nudged_min[channel_idx] ? nudged_min[channel_idx] : data;
226 data = data > nudged_max[channel_idx] ? nudged_max[channel_idx] : data;
227 quantized_values[cal_offset(dimension, indices)] = static_cast<int32_t>(
228 std::round((data - nudged_min[channel_idx]) * scaling_factor_inv));
234 node->dtype(loco::DataType::U8); // change the type of tensor
235 node->size<loco::DataType::U8>(size); // resize tensor
236 for (uint32_t i = 0; i < size; ++i)
238 node->at<loco::DataType::U8>(i) = std::min(kMaxScale, std::max(kMinScale, quantized_values[i]));
242 void asymmetric_wdequant_per_channel(CircleConst *node, std::vector<float> &scaling_factor,
243 std::vector<float> &nudged_min)
245 assert(node->dtype() == loco::DataType::U8);
246 uint32_t size = node->size<loco::DataType::U8>();
247 std::vector<float> dequantized_values(size);
249 loco::TensorShape dimension;
251 uint32_t indices[4] = {
254 int channel_dim_index{0};
256 if (!get_channel_dim_index(node, dimension, channel_dim_index))
262 for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++)
264 for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++)
266 for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++)
268 for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++)
270 int channel_idx = indices[channel_dim_index];
271 auto data = node->at<loco::DataType::U8>(cal_offset(dimension, indices));
272 dequantized_values[cal_offset(dimension, indices)] =
273 static_cast<float>(data) * scaling_factor[channel_idx] + nudged_min[channel_idx];
279 node->dtype(loco::DataType::FLOAT32); // change the type of tensor
280 node->size<loco::DataType::FLOAT32>(size); // resize tensor
281 for (uint32_t i = 0; i < size; ++i)
283 node->at<loco::DataType::FLOAT32>(i) = dequantized_values[i];
287 void asymmetric_wquant_with_minmax_per_layer(CircleConst *node, float min, float max,
288 float &scaling_factor, int64_t &zp, float &nudged_min,
292 const int32_t kMinScale = 0;
293 const int32_t kMaxScale = 255;
295 uint32_t size = node->size<loco::DataType::FLOAT32>();
296 compute_asym_scale_zp(min, max, scaling_factor, zp, nudged_min, nudged_max);
297 const float scaling_factor_inv = 1.0 / scaling_factor;
298 std::vector<int32_t> quantized_values(size);
299 for (uint32_t i = 0; i < size; ++i)
302 auto data = node->at<loco::DataType::FLOAT32>(i);
303 data = data < nudged_min ? nudged_min : data;
304 data = data > nudged_max ? nudged_max : data;
305 quantized_values[i] =
306 static_cast<int32_t>(std::round((data - nudged_min) * scaling_factor_inv));
309 node->dtype(loco::DataType::U8); // change the type of tensor
310 node->size<loco::DataType::U8>(size); // resize tensor
311 for (uint32_t i = 0; i < size; ++i)
313 node->at<loco::DataType::U8>(i) = std::min(kMaxScale, std::max(kMinScale, quantized_values[i]));
317 void asymmetric_wdequant_with_minmax_per_layer(CircleConst *node, float scaling_factor,
320 uint32_t size = node->size<loco::DataType::U8>();
321 std::vector<float> dequantized_values(size);
322 for (uint32_t i = 0; i < size; ++i)
324 auto data = node->at<loco::DataType::U8>(i);
325 dequantized_values[i] = static_cast<float>(data) * scaling_factor + nudged_min;
328 node->dtype(loco::DataType::FLOAT32); // change the type of tensor
329 node->size<loco::DataType::FLOAT32>(size); // resize tensor
330 for (uint32_t i = 0; i < size; ++i)
332 node->at<loco::DataType::FLOAT32>(i) = dequantized_values[i];
336 bool is_quantized(const CircleNode *node)
338 return node->dtype() == loco::DataType::U8 || // activation, weight
339 node->dtype() == loco::DataType::S16 || // activation, weight
340 node->dtype() == loco::DataType::S32; // bias
343 // Check if node is weights of conv2d, transepose_conv2d, depthwise_conv2d, or fully_connected layer
344 bool is_weights(CircleNode *node)
346 auto circle_const = dynamic_cast<CircleConst *>(node);
347 if (circle_const == nullptr)
350 auto succs = loco::succs(node);
351 if (succs.size() != 1) // assume weights is used by only one node
354 for (auto out : succs)
356 auto conv = dynamic_cast<CircleConv2D *>(out);
357 if (conv != nullptr && conv->filter() == circle_const && circle_const->rank() == 4)
360 auto dw_conv = dynamic_cast<CircleDepthwiseConv2D *>(out);
361 if (dw_conv != nullptr && dw_conv->filter() == circle_const && circle_const->rank() == 4)
364 auto tw_conv = dynamic_cast<CircleTransposeConv *>(out);
365 if (tw_conv != nullptr && tw_conv->filter() == circle_const && circle_const->rank() == 4)
368 auto fc = dynamic_cast<CircleFullyConnected *>(out);
369 if (fc != nullptr && fc->weights() == circle_const && circle_const->rank() == 2)
376 * @brief QuantizeDequantizeWeights quantizes and dequantizes tensors for weights
377 * @details Find min/max values on the fly, quantize the model, and dequantize the model
379 struct QuantizeDequantizeWeights final : public luci::CircleNodeMutableVisitor<bool>
381 QuantizeDequantizeWeights(loco::DataType input, loco::DataType output,
382 QuantizationGranularity granularity)
383 : input_type(input), output_type(output), granularity(granularity)
387 loco::DataType input_type;
388 loco::DataType output_type;
389 QuantizationGranularity granularity;
391 // Quantize and dequantize input tensors of each node
392 bool visit(luci::CircleNode *node)
394 assert(output_type == loco::DataType::U8 || output_type == loco::DataType::S16);
396 INFO(l) << "QuantizeDequantizeWeights visit node: " << node->name() << std::endl;
397 auto arity = node->arity();
398 for (uint32_t i = 0; i < arity; i++)
400 auto input_node = node->arg(i);
401 auto circle_node = loco::must_cast<luci::CircleNode *>(input_node);
403 // Check if this is already quantized
404 if (is_quantized(circle_node))
407 if (is_weights(circle_node))
409 auto circle_const = loco::must_cast<luci::CircleConst *>(circle_node);
411 // Find min/max per channel-wise
412 if (granularity == QuantizationGranularity::ChannelWise)
414 std::vector<float> min;
415 std::vector<float> max;
417 cal_minmax_per_channel(circle_const, min, max);
419 std::vector<float> nudged_min(min.size());
420 std::vector<float> nudged_max(min.size());
421 std::vector<float> scaling_factor(min.size());
422 std::vector<int64_t> zp(min.size());
424 if (output_type == loco::DataType::U8)
426 asymmetric_wquant_per_channel(circle_const, min, max, scaling_factor, zp, nudged_min,
428 asymmetric_wdequant_per_channel(circle_const, scaling_factor, nudged_min);
432 sym_wquant_per_channel(circle_const, min, max, scaling_factor, zp, nudged_min,
434 sym_wdequant_per_channel(circle_const, scaling_factor);
437 auto quantparam = std::make_unique<CircleQuantParam>();
438 quantparam->min = nudged_min;
439 quantparam->max = nudged_max;
440 quantparam->scale = scaling_factor;
441 quantparam->zerop = zp;
442 circle_node->quantparam(std::move(quantparam));
444 // Find min/max per layer-wise
447 float min = std::numeric_limits<float>::max();
448 float max = std::numeric_limits<float>::lowest();
449 for (uint32_t i = 0; i < circle_const->size<loco::DataType::FLOAT32>(); i++)
451 auto data = circle_const->at<loco::DataType::FLOAT32>(i);
452 min = data < min ? data : min;
453 max = data > max ? data : max;
455 float scaling_factor{0};
460 asymmetric_wquant_with_minmax_per_layer(circle_const, min, max, scaling_factor, zp,
461 nudged_min, nudged_max);
462 asymmetric_wdequant_with_minmax_per_layer(circle_const, scaling_factor, nudged_min);
463 auto quantparam = std::make_unique<CircleQuantParam>();
464 quantparam->min.push_back(nudged_min);
465 quantparam->max.push_back(nudged_max);
466 quantparam->scale.push_back(scaling_factor);
467 quantparam->zerop.push_back(zp);
468 circle_node->quantparam(std::move(quantparam));
478 bool QuantizeDequantizeWeightsPass::run(loco::Graph *g)
481 INFO(l) << "QuantizeDequantizeWeightsPass Start" << std::endl;
484 for (auto node : loco::active_nodes(loco::output_nodes(g)))
486 QuantizeDequantizeWeights qw(_input_dtype, _output_dtype, _granularity);
487 auto circle_node = loco::must_cast<luci::CircleNode *>(node);
488 circle_node->accept(&qw);
491 INFO(l) << "QuantizeDequantizeWeightsPass End" << std::endl;
492 return false; // one time run