inference-engine/src/vpu/graph_transformer/src/passes/sw_deconv_adaptation.cpp

   1 // Copyright (C) 2018-2019 Intel Corporation
   2 // SPDX-License-Identifier: Apache-2.0
   3 //
   4
   5 #include <vpu/pass_manager.hpp>
   6
   7 #include <vector>
   8 #include <string>
   9 #include <memory>
  10 #include <unordered_set>
  11 #include <set>
  12
  13 #include <ie_parallel.hpp>
  14
  15 #include <vpu/sw/utility.hpp>
  16 #include <vpu/utils/numeric.hpp>
  17
  18 namespace vpu {
  19
  20 namespace {
  21
  22 void depthDeconvolutionRelayoutCHW(
  23         const fp16_t* src, int src_size,
  24         fp16_t* dst, int dst_size,
  25         int KX, int KY,
  26         int channels) {
  27     ie::parallel_for3d(channels, KY, KX, [=](int c, int ky, int kx) {
  28         int iidx = c * KX * KY + ky * KX + kx;
  29         IE_ASSERT(iidx >= 0 && iidx < src_size);
  30
  31         int inv_kx = KX - kx - 1;
  32         int inv_ky = KY - ky - 1;
  33         int oidx = c * KX * KY + inv_ky * KX + inv_kx;
  34         IE_ASSERT(oidx >= 0 && oidx < dst_size);
  35
  36         dst[oidx] = src[iidx];
  37     });
  38 }
  39
  40 class DepthDeconvolutionCHWWeightsContent final : public CalculatedDataContent {
  41 public:
  42     DepthDeconvolutionCHWWeightsContent(
  43             const DataContent::Ptr& origContent,
  44             int KX, int KY, int channels) :
  45             CalculatedDataContent({origContent}),
  46             _KX(KX), _KY(KY), _channels(channels) {
  47     }
  48
  49 protected:
  50     void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
  51         VPU_PROFILE(DepthDeconvolutionCHWWeightsContent);
  52         depthDeconvolutionRelayoutCHW(
  53             baseContents[0]->get<fp16_t>(), _desc.totalDimSize(),
  54             static_cast<fp16_t*>(tempBuf), _desc.totalDimSize(),
  55             _KX, _KY, _channels);
  56     }
  57
  58 private:
  59     int _KX;
  60     int _KY;
  61     int _channels;
  62 };
  63
  64 void depthDeconvolutionRelayoutHWC(
  65         const fp16_t* src, int src_size,
  66         fp16_t* dst, int dst_size,
  67         int KX, int KY,
  68         int channels) {
  69     ie::parallel_for3d(channels, KY, KX, [=](int c, int ky, int kx) {
  70         int iidx = c * KX * KY + ky * KX + kx;
  71         IE_ASSERT(iidx < src_size);
  72
  73         int inv_kx = KX - kx - 1;
  74         int inv_ky = KY - ky - 1;
  75         int oidx = inv_ky * KX * channels + inv_kx * channels + c;
  76         IE_ASSERT(oidx < dst_size);
  77
  78         dst[oidx] = src[iidx];
  79     });
  80 }
  81
  82 class DepthDeconvolutionHWCWeightsContent final : public CalculatedDataContent {
  83 public:
  84     DepthDeconvolutionHWCWeightsContent(
  85             const DataContent::Ptr& origContent,
  86             int KX, int KY, int channels) :
  87             CalculatedDataContent({origContent}),
  88             _KX(KX), _KY(KY), _channels(channels) {
  89     }
  90
  91 protected:
  92     void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
  93         VPU_PROFILE(DepthDeconvolutionHWCWeightsContent);
  94         depthDeconvolutionRelayoutHWC(
  95             baseContents[0]->get<fp16_t>(), _desc.totalDimSize(),
  96             static_cast<fp16_t*>(tempBuf), _desc.totalDimSize(),
  97             _KX, _KY, _channels);
  98     }
  99
 100 private:
 101     int _KX;
 102     int _KY;
 103     int _channels;
 104 };
 105
 106 void deconvolutionRelayout(
 107     const fp16_t* src, int src_size,
 108     fp16_t* dst, int dst_size,
 109     int KX, int KY,
 110     int IC, int OC) {
 111     ie::parallel_for4d(OC, IC, KY, KX, [=](int oc, int ic, int ky, int kx) {
 112         int iidx = ic * OC * KY * KX
 113                  + oc * KY * KX
 114                  + ky * KX
 115                  + kx;
 116         IE_ASSERT(iidx >= 0 && iidx < src_size);
 117
 118         int inv_kx = KX - kx - 1;
 119         int inv_ky = KY - ky - 1;
 120         int oidx = oc * IC * KY * KX
 121                  + ic * KY * KX
 122                  + inv_ky * KX
 123                  + inv_kx;
 124         IE_ASSERT(oidx >=  0 && oidx < dst_size);
 125
 126         dst[oidx] = src[iidx];
 127     });
 128 }
 129
 130 class DeconvolutionWeightsContent final : public CalculatedDataContent {
 131 public:
 132     DeconvolutionWeightsContent(
 133             const DataContent::Ptr& origContent,
 134             int KX, int KY,
 135             int IC, int OC) :
 136             CalculatedDataContent({origContent}),
 137             _KX(KX), _KY(KY),
 138             _IC(IC), _OC(OC) {
 139     }
 140
 141 protected:
 142     size_t getTempBufSize(const SmallVector<DataContent::Ptr, 2>&) const override {
 143         return 2 * _desc.totalDimSize() * sizeof(fp16_t);
 144     }
 145
 146
 147     void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
 148         VPU_PROFILE(DeconvolutionWeightsContent);
 149
 150         auto dstPtr = static_cast<fp16_t*>(tempBuf);
 151         auto dstPtr2 = dstPtr + _desc.totalDimSize();
 152
 153         deconvolutionRelayout(
 154             baseContents[0]->get<fp16_t>(), _desc.totalDimSize(),
 155             dstPtr2, _desc.totalDimSize(),
 156             _KX, _KY,
 157             _IC, _OC);
 158
 159         kchw_to_hwkc(dstPtr2, dstPtr, _desc);
 160     }
 161
 162 private:
 163     int _KX;
 164     int _KY;
 165     int _IC;
 166     int _OC;
 167 };
 168
 169 class DeconvStage final : public StageNode {
 170 private:
 171     StagePtr cloneImpl() const override {
 172         return std::make_shared<DeconvStage>(*this);
 173     }
 174
 175     DataMap<float> propagateScaleFactorsImpl(
 176             const DataMap<float>&,
 177             ScalePropagationStep) override {
 178         VPU_THROW_EXCEPTION << "Must never be called";
 179     }
 180
 181     DataMap<DimsOrder> propagateDataOrderImpl() const override {
 182         IE_ASSERT(_inputEdges.size() == 3);
 183         IE_ASSERT(_outputEdges.size() == 1);
 184
 185         auto input = _inputEdges[0]->input();
 186         auto weights = _inputEdges[1]->input();
 187         auto output = _outputEdges[0]->output();
 188
 189         auto finalOrder = input->desc().dimsOrder();
 190         if (finalOrder.dimInd(Dim::C) == 1) {
 191             // HCW -> CHW
 192             finalOrder.moveDim(Dim::C, 2);
 193         }
 194
 195         DataMap<DimsOrder> out;
 196
 197         if (_type == StageType::DepthDeconv) {
 198             if (finalOrder != input->desc().dimsOrder()) {
 199                 out[input] = finalOrder;
 200             }
 201             out[output] = finalOrder;
 202         } else {
 203             out[input] = finalOrder.createMovedDim(Dim::C, 0);
 204             out[output] = finalOrder.createMovedDim(Dim::C, 0);
 205         }
 206
 207         return out;
 208     }
 209
 210     DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
 211         IE_ASSERT(_inputEdges.size() == 3);
 212         IE_ASSERT(_outputEdges.size() == 1);
 213
 214         auto input = _inputEdges[0]->input();
 215         auto weights = _inputEdges[1]->input();
 216         auto output = _outputEdges[0]->output();
 217
 218         auto finalOrder = input->desc().dimsOrder();
 219         if (finalOrder.dimInd(Dim::C) == 1) {
 220             // HCW -> CHW
 221             finalOrder.moveDim(Dim::C, 2);
 222         }
 223
 224         DataMap<StridesRequirement> out;
 225
 226         if (_type == StageType::DepthDeconv) {
 227             if (finalOrder.dimInd(Dim::C) == 0) {
 228                 // HWC
 229                 out[input] = StridesRequirement::compact();
 230                 out[output] = StridesRequirement::compact();
 231             }
 232         } else {
 233             out[input] = StridesRequirement::compact();
 234             out[output] = StridesRequirement::compact();
 235         }
 236
 237         return out;
 238     }
 239
 240     void finalizeDataLayoutImpl() override {
 241         IE_ASSERT(_inputEdges.size() == 3);
 242         IE_ASSERT(_outputEdges.size() == 1);
 243
 244         auto input = _inputEdges[0]->input();
 245         auto weights = _inputEdges[1]->input();
 246         auto output = _outputEdges[0]->output();
 247
 248         auto kernelSizeX = attrs().get<int>("kernelSizeX");
 249         auto kernelSizeY = attrs().get<int>("kernelSizeY");
 250
 251         Data swWeights;
 252
 253         if (_type == StageType::DepthDeconv) {
 254             if (input->desc().dimsOrder().dimInd(Dim::C) == 0) {
 255                 //
 256                 // HWC case
 257                 //
 258
 259                 swWeights = weights->attrs().getOrDefault<Data>("swWeights", nullptr);
 260                 if (swWeights == nullptr) {
 261                     DataDesc newWeightsDesc({
 262                         kernelSizeX * kernelSizeY,
 263                         1,
 264                         output->desc().dim(Dim::C)});
 265
 266                     swWeights = _model->duplicateData(
 267                         weights,
 268                         "@SW",
 269                         newWeightsDesc,
 270                         std::make_shared<DepthDeconvolutionHWCWeightsContent>(
 271                             weights->content(),
 272                             kernelSizeX, kernelSizeY,
 273                             output->desc().dim(Dim::C)));
 274
 275                     weights->attrs().set<Data>("swWeights", swWeights);
 276                 }
 277             } else if (input->desc().dimsOrder().dimInd(Dim::C) == 2) {
 278                 //
 279                 // CHW case
 280                 //
 281
 282                 swWeights = weights->attrs().getOrDefault<Data>("swWeights", nullptr);
 283                 if (swWeights == nullptr) {
 284                     DataDesc newWeightsDesc({
 285                         kernelSizeX * kernelSizeY,
 286                         1,
 287                         output->desc().dim(Dim::C)});
 288
 289                     swWeights = _model->duplicateData(
 290                         weights,
 291                         "@SW",
 292                         newWeightsDesc,
 293                         std::make_shared<DepthDeconvolutionCHWWeightsContent>(
 294                             weights->content(),
 295                             kernelSizeX, kernelSizeY,
 296                             output->desc().dim(Dim::C)));
 297
 298                     weights->attrs().set<Data>("swWeights", swWeights);
 299                 }
 300             }
 301         } else {
 302             swWeights = weights->attrs().getOrDefault<Data>("swWeights", nullptr);
 303             if (swWeights == nullptr) {
 304                 DataDesc newWeightsDesc({
 305                     kernelSizeX * kernelSizeY,
 306                     input->desc().dim(Dim::C),
 307                     output->desc().dim(Dim::C)});
 308
 309                 swWeights = _model->duplicateData(
 310                     weights,
 311                     "@SW",
 312                     newWeightsDesc,
 313                     std::make_shared<DeconvolutionWeightsContent>(
 314                         weights->content(),
 315                         kernelSizeX, kernelSizeY,
 316                         input->desc().dim(Dim::C),
 317                         output->desc().dim(Dim::C)));
 318
 319                 weights->attrs().set<Data>("swWeights", swWeights);
 320             }
 321         }
 322
 323         IE_ASSERT(swWeights != nullptr);
 324
 325         _model->replaceStageInput(_inputEdges[1], swWeights);
 326     }
 327
 328     DataMap<BatchSupport> getBatchSupportInfoImpl() const  override {
 329         IE_ASSERT(_inputEdges.size() == 3);
 330         IE_ASSERT(_outputEdges.size() == 1);
 331
 332         auto input = _inputEdges[0]->input();
 333         auto weights = _inputEdges[1]->input();
 334         auto biases = _inputEdges[2]->input();
 335         auto output = _outputEdges[0]->output();
 336
 337         DataMap<BatchSupport> out;
 338
 339         IE_ASSERT(weights->usage() == DataUsage::Const);
 340         IE_ASSERT(biases->usage() == DataUsage::Const || biases->usage() == DataUsage::Fake);
 341
 342         out[input] = BatchSupport::Split;
 343         out[output] = BatchSupport::Split;
 344
 345         return out;
 346     }
 347
 348     void finalCheckImpl() const override {
 349     }
 350
 351     void serializeParamsImpl(BlobSerializer& serializer) const override {
 352         auto kernelSizeX = attrs().get<int>("kernelSizeX");
 353         auto kernelSizeY = attrs().get<int>("kernelSizeY");
 354         auto kernelStrideX = attrs().get<int>("kernelStrideX");
 355         auto kernelStrideY = attrs().get<int>("kernelStrideY");
 356         auto padLeft = attrs().get<int>("padLeft");
 357         auto padTop = attrs().get<int>("padTop");
 358         auto dilationX = attrs().get<int>("dilationX");
 359         auto dilationY = attrs().get<int>("dilationY");
 360
 361         serializer.append(static_cast<uint32_t>(kernelSizeX));
 362         serializer.append(static_cast<uint32_t>(kernelSizeY));
 363         serializer.append(static_cast<uint32_t>(kernelStrideX));
 364         serializer.append(static_cast<uint32_t>(kernelStrideY));
 365         serializer.append(static_cast<uint32_t>(padLeft));
 366         serializer.append(static_cast<uint32_t>(padTop));
 367         serializer.append(static_cast<uint32_t>(dilationX));
 368         serializer.append(static_cast<uint32_t>(dilationY));
 369     }
 370
 371     void serializeDataImpl(BlobSerializer& serializer) const override {
 372         IE_ASSERT(_inputEdges.size() == 3);
 373         IE_ASSERT(_outputEdges.size() == 1);
 374
 375         auto input = _inputEdges[0]->input();
 376         auto weights = _inputEdges[1]->input();
 377         auto biases = _inputEdges[2]->input();
 378         auto output = _outputEdges[0]->output();
 379
 380         input->serializeOldBuffer(handle_from_this(), serializer);
 381         output->serializeOldBuffer(handle_from_this(), serializer);
 382         weights->serializeOldBuffer(handle_from_this(), serializer);
 383
 384         if (!_tempBufferEdges.empty()) {
 385             _tempBufferEdges[0]->tempBuffer()->serializeOldBuffer(handle_from_this(), serializer);
 386         }
 387
 388         // TODO: remove this
 389         biases->serializeOldBuffer(handle_from_this(), serializer);
 390     }
 391 };
 392
 393 class PassImpl final : public Pass {
 394 public:
 395     explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}
 396
 397     void run(const Model::Ptr& model) override;
 398
 399 private:
 400     StageBuilder::Ptr _stageBuilder;
 401 };
 402
 403 void PassImpl::run(const Model::Ptr& model) {
 404     VPU_PROFILE(swDeconvAdaptation);
 405
 406     for (const auto& stage : model->getStages()) {
 407         if (stage->type() != StageType::StubDeconv)
 408             continue;
 409
 410         auto input = stage->input(0);
 411         auto weights = stage->input(1);
 412         auto biases = stage->input(2);
 413         auto output = stage->output(0);
 414
 415         auto kernelSizeX = stage->attrs().get<int>("kernelSizeX");
 416         auto kernelSizeY = stage->attrs().get<int>("kernelSizeY");
 417         auto kernelStrideX = stage->attrs().get<int>("kernelStrideX");
 418         auto kernelStrideY = stage->attrs().get<int>("kernelStrideY");
 419         auto padLeft = stage->attrs().get<int>("padLeft");
 420         auto padRight = stage->attrs().get<int>("padRight");
 421         auto padTop = stage->attrs().get<int>("padTop");
 422         auto padBottom = stage->attrs().get<int>("padBottom");
 423         auto dilationX = stage->attrs().get<int>("dilationX");
 424         auto dilationY = stage->attrs().get<int>("dilationY");
 425         auto groupSize = stage->attrs().get<int>("groupSize");
 426
 427         model->disconnectStageDatas(stage);
 428
 429         if (groupSize == 0 ||
 430             (groupSize > input->desc().dim(Dim::C)) ||
 431             (input->desc().dim(Dim::C) % groupSize != 0) ||
 432             (groupSize > output->desc().dim(Dim::C)) ||
 433             (output->desc().dim(Dim::C) % groupSize != 0)) {
 434             VPU_THROW_EXCEPTION << "DeconvolutionLayer has invalid group value";
 435         }
 436
 437         if (groupSize == 1) {
 438             if (biases->usage() != DataUsage::Fake) {
 439                 auto tempOutput = model->duplicateData(
 440                     output,
 441                     "@temp");
 442
 443                 _stageBuilder->addBiasStage(
 444                     model,
 445                     stage->name() + "@biases",
 446                     stage->origLayer(),
 447                     tempOutput, biases,
 448                     output);
 449
 450                 output = tempOutput;
 451             }
 452
 453             auto swStage = model->addNewStage<DeconvStage>(
 454                 stage->name(),
 455                 StageType::Deconvolution,
 456                 stage->origLayer(),
 457                 {input, weights, biases},
 458                 {output});
 459
 460             swStage->attrs().set<int>("kernelSizeX", kernelSizeX);
 461             swStage->attrs().set<int>("kernelSizeY", kernelSizeY);
 462
 463             swStage->attrs().set<int>("kernelStrideX", kernelStrideX);
 464             swStage->attrs().set<int>("kernelStrideY", kernelStrideY);
 465
 466             swStage->attrs().set<int>("padLeft", padLeft);
 467             swStage->attrs().set<int>("padRight", padRight);
 468             swStage->attrs().set<int>("padTop", padTop);
 469             swStage->attrs().set<int>("padBottom", padBottom);
 470
 471             swStage->attrs().set<int>("dilationX", dilationX);
 472             swStage->attrs().set<int>("dilationY", dilationY);
 473         } else if (groupSize == input->desc().dim(Dim::C) &&
 474                    groupSize == output->desc().dim(Dim::C)) {
 475             if (biases->usage() != DataUsage::Fake) {
 476                 auto tempOutput = model->duplicateData(
 477                     output,
 478                     "@temp");
 479
 480                 _stageBuilder->addBiasStage(
 481                     model,
 482                     stage->name() + "@biases",
 483                     stage->origLayer(),
 484                     tempOutput, biases,
 485                     output);
 486
 487                 output = tempOutput;
 488             }
 489
 490             auto swStage = model->addNewStage<DeconvStage>(
 491                 stage->name(),
 492                 StageType::DepthDeconv,
 493                 stage->origLayer(),
 494                 {input, weights, biases},
 495                 {output});
 496
 497             swStage->attrs().set<int>("kernelSizeX", kernelSizeX);
 498             swStage->attrs().set<int>("kernelSizeY", kernelSizeY);
 499
 500             swStage->attrs().set<int>("kernelStrideX", kernelStrideX);
 501             swStage->attrs().set<int>("kernelStrideY", kernelStrideY);
 502
 503             swStage->attrs().set<int>("padLeft", padLeft);
 504             swStage->attrs().set<int>("padRight", padRight);
 505             swStage->attrs().set<int>("padTop", padTop);
 506             swStage->attrs().set<int>("padBottom", padBottom);
 507
 508             swStage->attrs().set<int>("dilationX", dilationX);
 509             swStage->attrs().set<int>("dilationY", dilationY);
 510         } else {
 511             VPU_THROW_EXCEPTION << "Internal error : grouped deconvolution was not processed";
 512         }
 513
 514         model->removeStage(stage);
 515     }
 516 }
 517
 518 }  // namespace
 519
 520 Pass::Ptr PassManager::swDeconvAdaptation() {
 521     return std::make_shared<PassImpl>(_stageBuilder);
 522 }
 523
 524 }  // namespace vpu