compiler/record-minmax/src/RecordMinMax.cpp

   1 /*
   2  * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *    http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include "RecordMinMax.h"
  18 #include "RecordFunction.h"
  19 #include "MinMaxObserver.h"
  20
  21 #include <luci/Importer.h>
  22 #include <luci/CircleExporter.h>
  23 #include <luci/CircleFileExpContract.h>
  24 #include <luci/IR/CircleQuantParam.h>
  25 #include <luci/Log.h>
  26 #include <dio_hdf5/HDF5Importer.h>
  27
#include <dirent.h>
#include <algorithm>
#include <cmath>
#include <exception>
#include <fstream>
#include <iostream>
#include <memory>
#include <numeric>
#include <random>
#include <stdexcept>
  36
  37 using Shape = std::vector<loco::Dimension>;
  38 using DataType = loco::DataType;
  39
  40 namespace
  41 {
  42
  43 // Max h5 file size for parallel recording in bytes = 1 GB
  44 const long h5_max_size_bytes = 1000000000;
  45
  46 long getH5FileSize(const std::string &input_data_path)
  47 {
  48   std::ifstream in_file(input_data_path, std::ios::binary);
  49   in_file.seekg(0, std::ios::end);
  50
  51   return in_file.tellg();
  52 }
  53
  54 uint32_t numElements(const luci::CircleNode *node)
  55 {
  56   uint32_t num_elements = 1;
  57   for (uint32_t i = 0; i < node->rank(); i++)
  58     num_elements *= node->dim(i).value();
  59
  60   return num_elements;
  61 }
  62
  63 // Throw exception if input has one of the following conditions.
  64 // 1. Have unknown dimension
  65 // 2. Number of elements is 0
  66 void checkInputDimension(const luci::CircleInput *input)
  67 {
  68   for (uint32_t i = 0; i < input->rank(); i++)
  69     if (!input->dim(i).known())
  70       throw std::runtime_error(input->name() + " has unknown dimension");
  71
  72   if (numElements(input) == 0)
  73     throw std::runtime_error(input->name() + " is a zero-sized input");
  74 }
  75
  76 void readDataFromFile(const std::string &filename, std::vector<char> &data, size_t data_size)
  77 {
  78   assert(data.size() == data_size); // FIX_CALLER_UNLESS
  79
  80   std::ifstream fs(filename, std::ifstream::binary);
  81   if (fs.fail())
  82     throw std::runtime_error("Cannot open file \"" + filename + "\".\n");
  83   if (fs.read(data.data(), data_size).fail())
  84     throw std::runtime_error("Failed to read data from file \"" + filename + "\".\n");
  85   if (fs.peek() != EOF)
  86     throw std::runtime_error("Input tensor size mismatches with \"" + filename + "\".\n");
  87 }
  88
  89 std::vector<uint8_t> genRandomBoolData(std::mt19937 &gen, uint32_t num_elements)
  90 {
  91   std::uniform_int_distribution<> dist(0, 1);
  92   std::vector<uint8_t> input_data(num_elements);
  93
  94   // Write random data
  95   for (auto &iter : input_data)
  96     iter = static_cast<uint8_t>(dist(gen));
  97
  98   return input_data;
  99 }
 100
 101 template <typename T>
 102 std::vector<T> genRandomIntData(std::mt19937 &gen, uint32_t num_elements, T min, T max)
 103 {
 104   std::uniform_int_distribution<T> dist(min, max);
 105   std::vector<T> input_data(num_elements);
 106
 107   // Write random data
 108   {
 109     auto const generator = [&gen, &dist]() { return dist(gen); };
 110     std::generate(begin(input_data), end(input_data), generator);
 111   }
 112
 113   return input_data;
 114 }
 115
 116 /**
 117  * @brief  getTensorSize will return size in bytes
 118  */
 119 template <typename NodeT> size_t getTensorSize(const NodeT *node)
 120 {
 121   uint32_t tensor_size = loco::size(node->dtype());
 122   for (uint32_t i = 0; i < node->rank(); ++i)
 123     tensor_size *= node->dim(i).value();
 124   return tensor_size;
 125 }
 126
 127 /**
 128  * @brief  verifyTypeShape checks the type and the shape of CircleInput
 129  *         This throws an exception if type or shape does not match
 130  */
 131 void verifyTypeShape(const luci::CircleInput *input_node, const DataType &dtype, const Shape &shape)
 132 {
 133   // Type check
 134   if (dtype != input_node->dtype())
 135     throw std::runtime_error("Wrong input type.");
 136
 137   if (shape.size() != input_node->rank())
 138     throw std::runtime_error("Input rank mismatch.");
 139
 140   for (uint32_t i = 0; i < shape.size(); i++)
 141   {
 142     if (not(shape.at(i) == input_node->dim(i)))
 143       throw std::runtime_error("Input shape mismatch.");
 144   }
 145 }
 146
 147 void update_quantparam(record_minmax::MinMaxObserver *observer, const std::string &mode,
 148                        float min_percentile, float max_percentile)
 149 {
 150   auto minmax_map = observer->minMaxData()->getMap();
 151   for (auto iter = minmax_map->begin(); iter != minmax_map->end(); ++iter)
 152   {
 153     auto node = iter->first;
 154     auto minmax = iter->second;
 155
 156     float min{0.0f}, max{0.0f};
 157     if (mode == "percentile")
 158     {
 159       min = record_minmax::getNthPercentile(minmax.min_vector, min_percentile);
 160       max = record_minmax::getNthPercentile(minmax.max_vector, max_percentile);
 161     }
 162     else if (mode == "moving_average")
 163     {
 164       min = record_minmax::getMovingAverage(minmax.min_vector, 0.9, 16, true);
 165       max = record_minmax::getMovingAverage(minmax.max_vector, 0.9, 16, false);
 166     }
 167     assert(mode == "percentile" || mode == "moving_average");
 168     auto quantparam = std::make_unique<luci::CircleQuantParam>();
 169     quantparam->min.push_back(min);
 170     quantparam->max.push_back(max);
 171
 172     assert(node->quantparam() == nullptr);
 173
 174     auto mutable_node = const_cast<luci::CircleNode *>(node);
 175     mutable_node->quantparam(std::move(quantparam));
 176   }
 177 }
 178
 179 } // namespace
 180
 181 namespace record_minmax
 182 {
 183
 184 void RecordMinMax::initialize(const std::string &input_model_path)
 185 {
 186   assert(_threads_size > 0);
 187
 188   // Load model from the file
 189   std::ifstream fs(input_model_path, std::ifstream::binary);
 190   if (fs.fail())
 191   {
 192     throw std::runtime_error("Cannot open model file \"" + input_model_path + "\".\n");
 193   }
 194   std::vector<char> model_data((std::istreambuf_iterator<char>(fs)),
 195                                std::istreambuf_iterator<char>());
 196
 197   // Verify flatbuffers
 198   flatbuffers::Verifier verifier{reinterpret_cast<const uint8_t *>(model_data.data()),
 199                                  model_data.size()};
 200   if (!circle::VerifyModelBuffer(verifier))
 201   {
 202     throw std::runtime_error("Failed to verify circle '" + input_model_path + "'");
 203   }
 204
 205   const circle::Model *circle_model = circle::GetModel(model_data.data());
 206   if (circle_model == nullptr)
 207   {
 208     throw std::runtime_error("Failed to load '" + input_model_path + "'");
 209   }
 210
 211   _module = luci::Importer().importModule(circle_model);
 212
 213   if (_module == nullptr)
 214   {
 215     throw std::runtime_error("Failed to load '" + input_model_path + "'");
 216   }
 217
 218   // Create and initialize interpreters and observers
 219   _interpreters.resize(_threads_size);
 220   _observers.resize(_threads_size);
 221
 222   for (uint32_t thread_idx = 0; thread_idx < _threads_size; ++thread_idx)
 223   {
 224     auto interpreter = std::make_unique<luci_interpreter::Interpreter>(_module.get());
 225     auto observer = std::make_unique<MinMaxObserver>();
 226
 227     interpreter->attachObserver(observer.get());
 228
 229     _observers[thread_idx] = std::move(observer);
 230     _interpreters[thread_idx] = std::move(interpreter);
 231   }
 232 }
 233
 234 // input_data_path is a path to the directory
 235 // The directory should contain binary files each of which is a raw data,
 236 // ready to be consumed by the input circle model without any modification
 237 // TODO reduce duplicate codes with profileRawData
 238 void RecordMinMax::profileRawDataDirectory(const std::string &mode,
 239                                            const std::string &input_data_path, float min_percentile,
 240                                            float max_percentile)
 241 {
 242   struct dirent *entry = nullptr;
 243   DIR *dp = nullptr;
 244
 245   dp = opendir(input_data_path.c_str());
 246   if (not dp)
 247     throw std::runtime_error("Cannot open directory. Please check \"" + input_data_path +
 248                              "\" is a directory.\n");
 249
 250   uint32_t num_records = 0;
 251   const auto input_nodes = loco::input_nodes(_module->graph());
 252
 253   // Get total input size
 254   uint32_t total_input_size = 0;
 255   for (auto input : input_nodes)
 256   {
 257     const auto *input_node = loco::must_cast<const luci::CircleInput *>(input);
 258     checkInputDimension(input_node);
 259     total_input_size += getTensorSize(input_node);
 260   }
 261
 262   while ((entry = readdir(dp)))
 263   {
 264     // Skip if the entry is not a regular file
 265     if (entry->d_type != DT_REG)
 266       continue;
 267
 268     const std::string filename = entry->d_name;
 269     std::cout << "Recording " << num_records << "'th data" << std::endl;
 270
 271     // Read data from file to buffer
 272     // Assumption: For a multi-input model, the binary file should have inputs concatenated in the
 273     // same order with the input index.
 274     std::vector<char> input_data(total_input_size);
 275     readDataFromFile(input_data_path + "/" + filename, input_data, total_input_size);
 276
 277     // Write data from buffer to interpreter
 278     uint32_t offset = 0;
 279     for (auto input : input_nodes)
 280     {
 281       const auto *input_node = loco::must_cast<const luci::CircleInput *>(input);
 282       const auto input_size = getTensorSize(input_node);
 283       getInterpreter()->writeInputTensor(input_node, input_data.data() + offset, input_size);
 284
 285       offset += input_size;
 286     }
 287
 288     getInterpreter()->interpret();
 289
 290     num_records++;
 291   }
 292
 293   closedir(dp);
 294
 295   if (num_records == 0)
 296     throw std::runtime_error("The input data file does not contain any record.");
 297
 298   std::cout << "Recording finished. Number of recorded data: " << num_records << std::endl;
 299
 300   update_quantparam(getObserver(), mode, min_percentile, max_percentile);
 301 }
 302
 303 // input_data_path is a text file which specifies the representative data
 304 // The text file should contain absolute file path per line.
 305 // The pointed file should be a binary file containing one representative data,
 306 // ready to be consumed by the input circle model without any modification
 307 // NOTE If a model has multiple inputs, the binary file should have inputs concatenated in the same
 308 // order with the input index of the circle model.
 309 void RecordMinMax::profileRawData(const std::string &mode, const std::string &input_data_path,
 310                                   float min_percentile, float max_percentile)
 311 {
 312   std::ifstream input_file(input_data_path);
 313   if (input_file.fail())
 314     throw std::runtime_error("Cannot open file \"" + input_data_path + "\".\n");
 315
 316   std::string record;
 317   uint32_t num_records = 0;
 318   const auto input_nodes = loco::input_nodes(_module->graph());
 319
 320   // Get total input size
 321   uint32_t total_input_size = 0;
 322   for (auto input : input_nodes)
 323   {
 324     const auto *input_node = loco::must_cast<const luci::CircleInput *>(input);
 325     checkInputDimension(input_node);
 326     total_input_size += getTensorSize(input_node);
 327   }
 328
 329   while (getline(input_file, record))
 330   {
 331     std::cout << "Recording " << num_records << "'th data" << std::endl;
 332
 333     // Read data from file to buffer
 334     // Assumption: For a multi-input model, the binary file should have inputs concatenated in the
 335     // same order with the input index.
 336     std::vector<char> input_data(total_input_size);
 337     readDataFromFile(record, input_data, total_input_size);
 338
 339     // Write data from buffer to interpreter
 340     uint32_t offset = 0;
 341     for (auto input : input_nodes)
 342     {
 343       const auto *input_node = loco::must_cast<const luci::CircleInput *>(input);
 344       const auto input_size = getTensorSize(input_node);
 345       getInterpreter()->writeInputTensor(input_node, input_data.data() + offset, input_size);
 346
 347       offset += input_size;
 348     }
 349
 350     getInterpreter()->interpret();
 351
 352     num_records++;
 353   }
 354
 355   if (num_records == 0)
 356     throw std::runtime_error("The input data file does not contain any record.");
 357
 358   std::cout << "Recording finished. Number of recorded data: " << num_records << std::endl;
 359
 360   update_quantparam(getObserver(), mode, min_percentile, max_percentile);
 361 }
 362
 363 WholeOutput RecordMinMax::importH5Data(const std::string &input_data_path)
 364 {
 365   try
 366   {
 367     dio::hdf5::HDF5Importer importer(input_data_path);
 368     importer.importGroup("value");
 369
 370     bool is_raw_data = importer.isRawData();
 371
 372     const auto num_records = importer.numData();
 373     if (num_records == 0)
 374       throw std::runtime_error("The input data file does not contain any record.");
 375
 376     const auto input_nodes = loco::input_nodes(_module->graph());
 377     const auto num_inputs = input_nodes.size();
 378
 379     WholeOutput whole_output(num_records);
 380
 381     // Read inputs to whole_output
 382     for (int i = 0; i < num_records; ++i)
 383     {
 384       if (num_inputs != static_cast<uint32_t>(importer.numInputs(i)))
 385         throw std::runtime_error("Wrong number of inputs.");
 386
 387       for (uint32_t input_idx = 0; input_idx < num_inputs; input_idx++)
 388       {
 389         const auto *input_node = loco::must_cast<const luci::CircleInput *>(input_nodes[input_idx]);
 390         assert(input_node->index() == input_idx);
 391         checkInputDimension(input_node);
 392         Buffer input_data(getTensorSize(input_node));
 393
 394         if (!is_raw_data)
 395         {
 396           DataType dtype;
 397           Shape shape;
 398           importer.readTensor(i, input_idx, &dtype, &shape, input_data.data());
 399
 400           // Check the type and the shape of the input data is valid
 401           verifyTypeShape(input_node, dtype, shape);
 402         }
 403         else
 404         {
 405           // Skip type/shape check for raw data
 406           importer.readTensor(i, input_idx, input_data.data());
 407         }
 408         whole_output[i].emplace_back(std::move(input_data));
 409       }
 410     }
 411
 412     return whole_output;
 413   }
 414   catch (const H5::Exception &e)
 415   {
 416     H5::Exception::printErrorStack();
 417     throw std::runtime_error("HDF5 error occurred.");
 418   }
 419 }
 420
 421 void RecordMinMax::profileData(const std::string &mode, const std::string &input_data_path,
 422                                float min_percentile, float max_percentile)
 423 {
 424   try
 425   {
 426     dio::hdf5::HDF5Importer importer(input_data_path);
 427     importer.importGroup("value");
 428
 429     bool is_raw_data = importer.isRawData();
 430
 431     const auto num_records = importer.numData();
 432     if (num_records == 0)
 433       throw std::runtime_error("The input data file does not contain any record.");
 434
 435     const auto input_nodes = loco::input_nodes(_module->graph());
 436     const auto num_inputs = input_nodes.size();
 437
 438     for (int32_t record_idx = 0; record_idx < num_records; record_idx++)
 439     {
 440       if (num_inputs != static_cast<uint32_t>(importer.numInputs(record_idx)))
 441         throw std::runtime_error("Wrong number of inputs.");
 442
 443       std::cout << "Recording " << record_idx << "'th data" << std::endl;
 444
 445       for (uint32_t input_idx = 0; input_idx < num_inputs; input_idx++)
 446       {
 447         const auto *input_node = loco::must_cast<const luci::CircleInput *>(input_nodes[input_idx]);
 448         assert(input_node->index() == input_idx);
 449         checkInputDimension(input_node);
 450         std::vector<char> input_data(getTensorSize(input_node));
 451
 452         if (!is_raw_data)
 453         {
 454           DataType dtype;
 455           Shape shape;
 456           importer.readTensor(record_idx, input_idx, &dtype, &shape, input_data.data());
 457
 458           // Check the type and the shape of the input data is valid
 459           verifyTypeShape(input_node, dtype, shape);
 460         }
 461         else
 462         {
 463           // Skip type/shape check for raw data
 464           importer.readTensor(record_idx, input_idx, input_data.data());
 465         }
 466
 467         // TODO: Input data is copied twice (file -> buffer (input_data) -> interpreter inputs)
 468         //       We can redcue the copy by directly writing data from file to interpreter inputs
 469         getInterpreter()->writeInputTensor(input_node, input_data.data(), input_data.size());
 470       }
 471
 472       getInterpreter()->interpret();
 473     }
 474
 475     std::cout << "Recording finished. Number of recorded data: " << num_records << std::endl;
 476   }
 477   catch (const H5::Exception &e)
 478   {
 479     H5::Exception::printErrorStack();
 480     throw std::runtime_error("HDF5 error occurred.");
 481   }
 482
 483   update_quantparam(getObserver(), mode, min_percentile, max_percentile);
 484 }
 485
 486 void RecordMinMax::profileDataInParallel(const std::string &mode,
 487                                          const std::string &input_data_path, float min_percentile,
 488                                          float max_percentile)
 489 {
 490   LOGGER(l);
 491
 492   assert(_interpreters.size() == _threads_size);
 493   assert(_observers.size() == _threads_size);
 494
 495   const long h5_file_size = getH5FileSize(input_data_path);
 496
 497   if (h5_file_size > h5_max_size_bytes)
 498     throw std::runtime_error("H5 file size is too large for parallel recording");
 499
 500   WholeOutput whole_output;
 501   try
 502   {
 503     whole_output = importH5Data(input_data_path);
 504   }
 505   catch (const std::bad_alloc &e)
 506   {
 507     throw std::runtime_error("Out of memory during h5 data load.");
 508   }
 509
 510   const auto num_records = whole_output.size();
 511   const auto input_nodes = loco::input_nodes(_module->graph());
 512
 513   // Start parallel part
 514   INFO(l) << _threads_size << " concurrent threads are supported." << std::endl;
 515
 516   const auto run_threads = num_records < _threads_size ? num_records : _threads_size;
 517
 518   const auto records_batch = static_cast<uint32_t>(num_records / run_threads);
 519
 520   auto interpret_batch = [&whole_output, &input_nodes](int first_record, int last_record,
 521                                                        luci_interpreter::Interpreter *interpreter) {
 522     for (int record_index = first_record; record_index < last_record; ++record_index)
 523     {
 524       for (uint32_t input_idx = 0; input_idx < input_nodes.size(); input_idx++)
 525       {
 526         const auto *input_node = loco::must_cast<const luci::CircleInput *>(input_nodes[input_idx]);
 527
 528         const auto &cur_input_data = whole_output[record_index][input_idx];
 529         interpreter->writeInputTensor(input_node, cur_input_data.data(), cur_input_data.size());
 530       }
 531       interpreter->interpret();
 532     }
 533   };
 534
 535   std::vector<std::thread> threads;
 536   for (uint32_t t = 0; t < run_threads; ++t)
 537   {
 538     if (t < run_threads - 1)
 539     {
 540       threads.emplace_back(interpret_batch, records_batch * t, records_batch * (t + 1),
 541                            _interpreters[t].get());
 542     }
 543     else
 544     {
 545       threads.emplace_back(interpret_batch, records_batch * t, num_records, _interpreters[t].get());
 546     }
 547   }
 548
 549   for (uint32_t i = 0; i < run_threads; ++i)
 550     threads.at(i).join();
 551
 552   // End parallel part
 553
 554   // Copy all min, max values to one observer
 555   auto observer = std::make_unique<MinMaxObserver>();
 556   auto main_min_max_map = const_cast<MinMaxMap *>(observer->minMaxData());
 557
 558   for (const auto &obs : _observers)
 559   {
 560     const auto cur_minmax_map = obs->minMaxData()->getMap();
 561     for (auto &iter : *cur_minmax_map)
 562     {
 563       const auto node = iter.first;
 564       const auto &minmax = iter.second;
 565
 566       main_min_max_map->appendMinMaxVector(node, minmax);
 567     }
 568   }
 569
 570   std::cout << "Recording finished. Number of recorded data: " << num_records << std::endl;
 571
 572   update_quantparam(observer.get(), mode, min_percentile, max_percentile);
 573 }
 574
 575 void RecordMinMax::profileDataWithRandomInputs(const std::string &mode, float min_percentile,
 576                                                float max_percentile)
 577 {
 578   // We use three randomly-generated records
 579   const uint32_t num_records = 3;
 580
 581   const auto input_nodes = loco::input_nodes(_module->graph());
 582   const auto num_inputs = input_nodes.size();
 583
 584   std::random_device rd;
 585   std::mt19937 gen(rd());
 586   std::uniform_real_distribution<> dist(-5, 5);
 587
 588   for (uint32_t record_idx = 0; record_idx < num_records; record_idx++)
 589   {
 590     std::cout << "Recording " << record_idx << "'th data" << std::endl;
 591
 592     for (uint32_t input_idx = 0; input_idx < num_inputs; input_idx++)
 593     {
 594       const auto *input_node = loco::must_cast<const luci::CircleInput *>(input_nodes[input_idx]);
 595       assert(input_node->index() == input_idx);
 596       checkInputDimension(input_node);
 597
 598       const auto num_elements = numElements(input_node);
 599
 600       // TODO Support more input data types
 601       assert(input_node->dtype() == loco::DataType::FLOAT32 ||
 602              input_node->dtype() == loco::DataType::BOOL ||
 603              input_node->dtype() == loco::DataType::S32 ||
 604              input_node->dtype() == loco::DataType::S64);
 605
 606       if (input_node->dtype() == DataType::FLOAT32)
 607       {
 608         std::vector<float> input_data(num_elements);
 609
 610         // Write random data
 611         for (auto &iter : input_data)
 612           iter = static_cast<float>(dist(gen));
 613
 614         // TODO: Input data is copied twice (file -> buffer (input_data) -> interpreter inputs)
 615         //       We can redcue the copy by directly writing data from file to interpreter inputs
 616         getInterpreter()->writeInputTensor(input_node, input_data.data(),
 617                                            input_data.size() * sizeof(float));
 618       }
 619       else if (input_node->dtype() == DataType::BOOL)
 620       {
 621         auto input_data = genRandomBoolData(gen, num_elements);
 622         getInterpreter()->writeInputTensor(input_node, input_data.data(),
 623                                            input_data.size() * sizeof(uint8_t));
 624       }
 625       else if (input_node->dtype() == DataType::S32)
 626       {
 627         auto input_data = genRandomIntData<int32_t>(gen, num_elements, 0, 100);
 628         getInterpreter()->writeInputTensor(input_node, input_data.data(),
 629                                            input_data.size() * sizeof(int32_t));
 630       }
 631       else if (input_node->dtype() == DataType::S64)
 632       {
 633         auto input_data = genRandomIntData<int64_t>(gen, num_elements, 0, 100);
 634         getInterpreter()->writeInputTensor(input_node, input_data.data(),
 635                                            input_data.size() * sizeof(int64_t));
 636       }
 637     }
 638
 639     getInterpreter()->interpret();
 640   }
 641
 642   std::cout << "Recording finished. Number of recorded data: " << num_records << std::endl;
 643
 644   update_quantparam(getObserver(), mode, min_percentile, max_percentile);
 645 }
 646
 647 void RecordMinMax::saveModel(const std::string &output_model_path)
 648 {
 649   // Export to output Circle file
 650   luci::CircleExporter exporter;
 651
 652   luci::CircleFileExpContract contract(_module.get(), output_model_path);
 653
 654   if (!exporter.invoke(&contract))
 655   {
 656     throw std::runtime_error("Failed to export '" + output_model_path + "'");
 657   }
 658 }
 659
 660 } // namespace record_minmax