Publishing 2019 R1.1 content and Myriad plugin sources (#162)
[platform/upstream/dldt.git] / inference-engine / src / vpu / graph_transformer / src / passes / sw_deconv_adaptation.cpp
1 // Copyright (C) 2018-2019 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
3 //
4
5 #include <vpu/pass_manager.hpp>
6
7 #include <vector>
8 #include <string>
9 #include <memory>
10 #include <unordered_set>
11 #include <set>
12
13 #include <ie_parallel.hpp>
14
15 #include <vpu/sw/utility.hpp>
16 #include <vpu/utils/numeric.hpp>
17
18 namespace vpu {
19
20 namespace {
21
22 void depthDeconvolutionRelayoutCHW(
23         const fp16_t* src, int src_size,
24         fp16_t* dst, int dst_size,
25         int KX, int KY,
26         int channels) {
27     ie::parallel_for3d(channels, KY, KX, [=](int c, int ky, int kx) {
28         int iidx = c * KX * KY + ky * KX + kx;
29         IE_ASSERT(iidx >= 0 && iidx < src_size);
30
31         int inv_kx = KX - kx - 1;
32         int inv_ky = KY - ky - 1;
33         int oidx = c * KX * KY + inv_ky * KX + inv_kx;
34         IE_ASSERT(oidx >= 0 && oidx < dst_size);
35
36         dst[oidx] = src[iidx];
37     });
38 }
39
40 class DepthDeconvolutionCHWWeightsContent final : public CalculatedDataContent {
41 public:
42     DepthDeconvolutionCHWWeightsContent(
43             const DataContent::Ptr& origContent,
44             int KX, int KY, int channels) :
45             CalculatedDataContent({origContent}),
46             _KX(KX), _KY(KY), _channels(channels) {
47     }
48
49 protected:
50     void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
51         VPU_PROFILE(DepthDeconvolutionCHWWeightsContent);
52         depthDeconvolutionRelayoutCHW(
53             baseContents[0]->get<fp16_t>(), _desc.totalDimSize(),
54             static_cast<fp16_t*>(tempBuf), _desc.totalDimSize(),
55             _KX, _KY, _channels);
56     }
57
58 private:
59     int _KX;
60     int _KY;
61     int _channels;
62 };
63
64 void depthDeconvolutionRelayoutHWC(
65         const fp16_t* src, int src_size,
66         fp16_t* dst, int dst_size,
67         int KX, int KY,
68         int channels) {
69     ie::parallel_for3d(channels, KY, KX, [=](int c, int ky, int kx) {
70         int iidx = c * KX * KY + ky * KX + kx;
71         IE_ASSERT(iidx < src_size);
72
73         int inv_kx = KX - kx - 1;
74         int inv_ky = KY - ky - 1;
75         int oidx = inv_ky * KX * channels + inv_kx * channels + c;
76         IE_ASSERT(oidx < dst_size);
77
78         dst[oidx] = src[iidx];
79     });
80 }
81
82 class DepthDeconvolutionHWCWeightsContent final : public CalculatedDataContent {
83 public:
84     DepthDeconvolutionHWCWeightsContent(
85             const DataContent::Ptr& origContent,
86             int KX, int KY, int channels) :
87             CalculatedDataContent({origContent}),
88             _KX(KX), _KY(KY), _channels(channels) {
89     }
90
91 protected:
92     void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
93         VPU_PROFILE(DepthDeconvolutionHWCWeightsContent);
94         depthDeconvolutionRelayoutHWC(
95             baseContents[0]->get<fp16_t>(), _desc.totalDimSize(),
96             static_cast<fp16_t*>(tempBuf), _desc.totalDimSize(),
97             _KX, _KY, _channels);
98     }
99
100 private:
101     int _KX;
102     int _KY;
103     int _channels;
104 };
105
106 void deconvolutionRelayout(
107     const fp16_t* src, int src_size,
108     fp16_t* dst, int dst_size,
109     int KX, int KY,
110     int IC, int OC) {
111     ie::parallel_for4d(OC, IC, KY, KX, [=](int oc, int ic, int ky, int kx) {
112         int iidx = ic * OC * KY * KX
113                  + oc * KY * KX
114                  + ky * KX
115                  + kx;
116         IE_ASSERT(iidx >= 0 && iidx < src_size);
117
118         int inv_kx = KX - kx - 1;
119         int inv_ky = KY - ky - 1;
120         int oidx = oc * IC * KY * KX
121                  + ic * KY * KX
122                  + inv_ky * KX
123                  + inv_kx;
124         IE_ASSERT(oidx >=  0 && oidx < dst_size);
125
126         dst[oidx] = src[iidx];
127     });
128 }
129
130 class DeconvolutionWeightsContent final : public CalculatedDataContent {
131 public:
132     DeconvolutionWeightsContent(
133             const DataContent::Ptr& origContent,
134             int KX, int KY,
135             int IC, int OC) :
136             CalculatedDataContent({origContent}),
137             _KX(KX), _KY(KY),
138             _IC(IC), _OC(OC) {
139     }
140
141 protected:
142     size_t getTempBufSize(const SmallVector<DataContent::Ptr, 2>&) const override {
143         return 2 * _desc.totalDimSize() * sizeof(fp16_t);
144     }
145
146
147     void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
148         VPU_PROFILE(DeconvolutionWeightsContent);
149
150         auto dstPtr = static_cast<fp16_t*>(tempBuf);
151         auto dstPtr2 = dstPtr + _desc.totalDimSize();
152
153         deconvolutionRelayout(
154             baseContents[0]->get<fp16_t>(), _desc.totalDimSize(),
155             dstPtr2, _desc.totalDimSize(),
156             _KX, _KY,
157             _IC, _OC);
158
159         kchw_to_hwkc(dstPtr2, dstPtr, _desc);
160     }
161
162 private:
163     int _KX;
164     int _KY;
165     int _IC;
166     int _OC;
167 };
168
169 class DeconvStage final : public StageNode {
170 private:
171     StagePtr cloneImpl() const override {
172         return std::make_shared<DeconvStage>(*this);
173     }
174
175     DataMap<float> propagateScaleFactorsImpl(
176             const DataMap<float>&,
177             ScalePropagationStep) override {
178         VPU_THROW_EXCEPTION << "Must never be called";
179     }
180
181     DataMap<DimsOrder> propagateDataOrderImpl() const override {
182         IE_ASSERT(_inputEdges.size() == 3);
183         IE_ASSERT(_outputEdges.size() == 1);
184
185         auto input = _inputEdges[0]->input();
186         auto weights = _inputEdges[1]->input();
187         auto output = _outputEdges[0]->output();
188
189         auto finalOrder = input->desc().dimsOrder();
190         if (finalOrder.dimInd(Dim::C) == 1) {
191             // HCW -> CHW
192             finalOrder.moveDim(Dim::C, 2);
193         }
194
195         DataMap<DimsOrder> out;
196
197         if (_type == StageType::DepthDeconv) {
198             if (finalOrder != input->desc().dimsOrder()) {
199                 out[input] = finalOrder;
200             }
201             out[output] = finalOrder;
202         } else {
203             out[input] = finalOrder.createMovedDim(Dim::C, 0);
204             out[output] = finalOrder.createMovedDim(Dim::C, 0);
205         }
206
207         return out;
208     }
209
210     DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
211         IE_ASSERT(_inputEdges.size() == 3);
212         IE_ASSERT(_outputEdges.size() == 1);
213
214         auto input = _inputEdges[0]->input();
215         auto weights = _inputEdges[1]->input();
216         auto output = _outputEdges[0]->output();
217
218         auto finalOrder = input->desc().dimsOrder();
219         if (finalOrder.dimInd(Dim::C) == 1) {
220             // HCW -> CHW
221             finalOrder.moveDim(Dim::C, 2);
222         }
223
224         DataMap<StridesRequirement> out;
225
226         if (_type == StageType::DepthDeconv) {
227             if (finalOrder.dimInd(Dim::C) == 0) {
228                 // HWC
229                 out[input] = StridesRequirement::compact();
230                 out[output] = StridesRequirement::compact();
231             }
232         } else {
233             out[input] = StridesRequirement::compact();
234             out[output] = StridesRequirement::compact();
235         }
236
237         return out;
238     }
239
240     void finalizeDataLayoutImpl() override {
241         IE_ASSERT(_inputEdges.size() == 3);
242         IE_ASSERT(_outputEdges.size() == 1);
243
244         auto input = _inputEdges[0]->input();
245         auto weights = _inputEdges[1]->input();
246         auto output = _outputEdges[0]->output();
247
248         auto kernelSizeX = attrs().get<int>("kernelSizeX");
249         auto kernelSizeY = attrs().get<int>("kernelSizeY");
250
251         Data swWeights;
252
253         if (_type == StageType::DepthDeconv) {
254             if (input->desc().dimsOrder().dimInd(Dim::C) == 0) {
255                 //
256                 // HWC case
257                 //
258
259                 swWeights = weights->attrs().getOrDefault<Data>("swWeights", nullptr);
260                 if (swWeights == nullptr) {
261                     DataDesc newWeightsDesc({
262                         kernelSizeX * kernelSizeY,
263                         1,
264                         output->desc().dim(Dim::C)});
265
266                     swWeights = _model->duplicateData(
267                         weights,
268                         "@SW",
269                         newWeightsDesc,
270                         std::make_shared<DepthDeconvolutionHWCWeightsContent>(
271                             weights->content(),
272                             kernelSizeX, kernelSizeY,
273                             output->desc().dim(Dim::C)));
274
275                     weights->attrs().set<Data>("swWeights", swWeights);
276                 }
277             } else if (input->desc().dimsOrder().dimInd(Dim::C) == 2) {
278                 //
279                 // CHW case
280                 //
281
282                 swWeights = weights->attrs().getOrDefault<Data>("swWeights", nullptr);
283                 if (swWeights == nullptr) {
284                     DataDesc newWeightsDesc({
285                         kernelSizeX * kernelSizeY,
286                         1,
287                         output->desc().dim(Dim::C)});
288
289                     swWeights = _model->duplicateData(
290                         weights,
291                         "@SW",
292                         newWeightsDesc,
293                         std::make_shared<DepthDeconvolutionCHWWeightsContent>(
294                             weights->content(),
295                             kernelSizeX, kernelSizeY,
296                             output->desc().dim(Dim::C)));
297
298                     weights->attrs().set<Data>("swWeights", swWeights);
299                 }
300             }
301         } else {
302             swWeights = weights->attrs().getOrDefault<Data>("swWeights", nullptr);
303             if (swWeights == nullptr) {
304                 DataDesc newWeightsDesc({
305                     kernelSizeX * kernelSizeY,
306                     input->desc().dim(Dim::C),
307                     output->desc().dim(Dim::C)});
308
309                 swWeights = _model->duplicateData(
310                     weights,
311                     "@SW",
312                     newWeightsDesc,
313                     std::make_shared<DeconvolutionWeightsContent>(
314                         weights->content(),
315                         kernelSizeX, kernelSizeY,
316                         input->desc().dim(Dim::C),
317                         output->desc().dim(Dim::C)));
318
319                 weights->attrs().set<Data>("swWeights", swWeights);
320             }
321         }
322
323         IE_ASSERT(swWeights != nullptr);
324
325         _model->replaceStageInput(_inputEdges[1], swWeights);
326     }
327
328     DataMap<BatchSupport> getBatchSupportInfoImpl() const  override {
329         IE_ASSERT(_inputEdges.size() == 3);
330         IE_ASSERT(_outputEdges.size() == 1);
331
332         auto input = _inputEdges[0]->input();
333         auto weights = _inputEdges[1]->input();
334         auto biases = _inputEdges[2]->input();
335         auto output = _outputEdges[0]->output();
336
337         DataMap<BatchSupport> out;
338
339         IE_ASSERT(weights->usage() == DataUsage::Const);
340         IE_ASSERT(biases->usage() == DataUsage::Const || biases->usage() == DataUsage::Fake);
341
342         out[input] = BatchSupport::Split;
343         out[output] = BatchSupport::Split;
344
345         return out;
346     }
347
348     void finalCheckImpl() const override {
349     }
350
351     void serializeParamsImpl(BlobSerializer& serializer) const override {
352         auto kernelSizeX = attrs().get<int>("kernelSizeX");
353         auto kernelSizeY = attrs().get<int>("kernelSizeY");
354         auto kernelStrideX = attrs().get<int>("kernelStrideX");
355         auto kernelStrideY = attrs().get<int>("kernelStrideY");
356         auto padLeft = attrs().get<int>("padLeft");
357         auto padTop = attrs().get<int>("padTop");
358         auto dilationX = attrs().get<int>("dilationX");
359         auto dilationY = attrs().get<int>("dilationY");
360
361         serializer.append(static_cast<uint32_t>(kernelSizeX));
362         serializer.append(static_cast<uint32_t>(kernelSizeY));
363         serializer.append(static_cast<uint32_t>(kernelStrideX));
364         serializer.append(static_cast<uint32_t>(kernelStrideY));
365         serializer.append(static_cast<uint32_t>(padLeft));
366         serializer.append(static_cast<uint32_t>(padTop));
367         serializer.append(static_cast<uint32_t>(dilationX));
368         serializer.append(static_cast<uint32_t>(dilationY));
369     }
370
371     void serializeDataImpl(BlobSerializer& serializer) const override {
372         IE_ASSERT(_inputEdges.size() == 3);
373         IE_ASSERT(_outputEdges.size() == 1);
374
375         auto input = _inputEdges[0]->input();
376         auto weights = _inputEdges[1]->input();
377         auto biases = _inputEdges[2]->input();
378         auto output = _outputEdges[0]->output();
379
380         input->serializeOldBuffer(handle_from_this(), serializer);
381         output->serializeOldBuffer(handle_from_this(), serializer);
382         weights->serializeOldBuffer(handle_from_this(), serializer);
383
384         if (!_tempBufferEdges.empty()) {
385             _tempBufferEdges[0]->tempBuffer()->serializeOldBuffer(handle_from_this(), serializer);
386         }
387
388         // TODO: remove this
389         biases->serializeOldBuffer(handle_from_this(), serializer);
390     }
391 };
392
393 class PassImpl final : public Pass {
394 public:
395     explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}
396
397     void run(const Model::Ptr& model) override;
398
399 private:
400     StageBuilder::Ptr _stageBuilder;
401 };
402
403 void PassImpl::run(const Model::Ptr& model) {
404     VPU_PROFILE(swDeconvAdaptation);
405
406     for (const auto& stage : model->getStages()) {
407         if (stage->type() != StageType::StubDeconv)
408             continue;
409
410         auto input = stage->input(0);
411         auto weights = stage->input(1);
412         auto biases = stage->input(2);
413         auto output = stage->output(0);
414
415         auto kernelSizeX = stage->attrs().get<int>("kernelSizeX");
416         auto kernelSizeY = stage->attrs().get<int>("kernelSizeY");
417         auto kernelStrideX = stage->attrs().get<int>("kernelStrideX");
418         auto kernelStrideY = stage->attrs().get<int>("kernelStrideY");
419         auto padLeft = stage->attrs().get<int>("padLeft");
420         auto padRight = stage->attrs().get<int>("padRight");
421         auto padTop = stage->attrs().get<int>("padTop");
422         auto padBottom = stage->attrs().get<int>("padBottom");
423         auto dilationX = stage->attrs().get<int>("dilationX");
424         auto dilationY = stage->attrs().get<int>("dilationY");
425         auto groupSize = stage->attrs().get<int>("groupSize");
426
427         model->disconnectStageDatas(stage);
428
429         if (groupSize == 0 ||
430             (groupSize > input->desc().dim(Dim::C)) ||
431             (input->desc().dim(Dim::C) % groupSize != 0) ||
432             (groupSize > output->desc().dim(Dim::C)) ||
433             (output->desc().dim(Dim::C) % groupSize != 0)) {
434             VPU_THROW_EXCEPTION << "DeconvolutionLayer has invalid group value";
435         }
436
437         if (groupSize == 1) {
438             if (biases->usage() != DataUsage::Fake) {
439                 auto tempOutput = model->duplicateData(
440                     output,
441                     "@temp");
442
443                 _stageBuilder->addBiasStage(
444                     model,
445                     stage->name() + "@biases",
446                     stage->origLayer(),
447                     tempOutput, biases,
448                     output);
449
450                 output = tempOutput;
451             }
452
453             auto swStage = model->addNewStage<DeconvStage>(
454                 stage->name(),
455                 StageType::Deconvolution,
456                 stage->origLayer(),
457                 {input, weights, biases},
458                 {output});
459
460             swStage->attrs().set<int>("kernelSizeX", kernelSizeX);
461             swStage->attrs().set<int>("kernelSizeY", kernelSizeY);
462
463             swStage->attrs().set<int>("kernelStrideX", kernelStrideX);
464             swStage->attrs().set<int>("kernelStrideY", kernelStrideY);
465
466             swStage->attrs().set<int>("padLeft", padLeft);
467             swStage->attrs().set<int>("padRight", padRight);
468             swStage->attrs().set<int>("padTop", padTop);
469             swStage->attrs().set<int>("padBottom", padBottom);
470
471             swStage->attrs().set<int>("dilationX", dilationX);
472             swStage->attrs().set<int>("dilationY", dilationY);
473         } else if (groupSize == input->desc().dim(Dim::C) &&
474                    groupSize == output->desc().dim(Dim::C)) {
475             if (biases->usage() != DataUsage::Fake) {
476                 auto tempOutput = model->duplicateData(
477                     output,
478                     "@temp");
479
480                 _stageBuilder->addBiasStage(
481                     model,
482                     stage->name() + "@biases",
483                     stage->origLayer(),
484                     tempOutput, biases,
485                     output);
486
487                 output = tempOutput;
488             }
489
490             auto swStage = model->addNewStage<DeconvStage>(
491                 stage->name(),
492                 StageType::DepthDeconv,
493                 stage->origLayer(),
494                 {input, weights, biases},
495                 {output});
496
497             swStage->attrs().set<int>("kernelSizeX", kernelSizeX);
498             swStage->attrs().set<int>("kernelSizeY", kernelSizeY);
499
500             swStage->attrs().set<int>("kernelStrideX", kernelStrideX);
501             swStage->attrs().set<int>("kernelStrideY", kernelStrideY);
502
503             swStage->attrs().set<int>("padLeft", padLeft);
504             swStage->attrs().set<int>("padRight", padRight);
505             swStage->attrs().set<int>("padTop", padTop);
506             swStage->attrs().set<int>("padBottom", padBottom);
507
508             swStage->attrs().set<int>("dilationX", dilationX);
509             swStage->attrs().set<int>("dilationY", dilationY);
510         } else {
511             VPU_THROW_EXCEPTION << "Internal error : grouped deconvolution was not processed";
512         }
513
514         model->removeStage(stage);
515     }
516 }
517
518 }  // namespace
519
520 Pass::Ptr PassManager::swDeconvAdaptation() {
521     return std::make_shared<PassImpl>(_stageBuilder);
522 }
523
524 }  // namespace vpu