Publishing 2019 R1.1 content and Myriad plugin sources (#162)
[platform/upstream/dldt.git] / inference-engine / src / vpu / graph_transformer / src / passes / split_hw_conv_and_pool.cpp
1 // Copyright (C) 2018-2019 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
3 //
4
5 #include <vpu/pass_manager.hpp>
6
7 #include <vector>
8 #include <set>
9 #include <memory>
10 #include <array>
11
12 #include <vpu/compile_env.hpp>
13 #include <vpu/hw/utility.hpp>
14
15 namespace vpu {
16
17 namespace {
18
// Pass that splits an oversized HW conv + pool pair into per-channel tiles.
// The actual rewrite lives in PassImpl::run; the StageBuilder is kept so the
// pass can emit the concat stage that merges the per-tile pool outputs.
class PassImpl final : public Pass {
public:
    explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}

    void run(const Model::Ptr& model) override;

private:
    StageBuilder::Ptr _stageBuilder;  // used by run() to add the final concat stage
};
28
// Splits a HW conv followed by a single HW pool along the output-channel axis.
//
// Trigger: a StubConv marked "tryHW" whose Intermediate output's HW buffer
// exceeds the CMX limit and whose only consumer is a HW StubAvgPool/StubMaxPool.
// Rewrite: duplicate the conv+pool pair once per channel tile (with sliced
// weights/biases), then concat the per-tile pool outputs over Dim::C into the
// original pool output.
void PassImpl::run(const Model::Ptr& model) {
    VPU_PROFILE(splitHwConvAndPool);

    const auto& env = CompileEnv::get();

    for (const auto& convStage : model->getStages()) {
        // Stages removed by earlier iterations of this loop may appear as
        // null handles in the stage list.
        if (convStage == nullptr) {
            continue;
        }

        if (convStage->type() != StageType::StubConv) {
            continue;
        }

        // Only convolutions still scheduled for HW execution are candidates.
        auto convHW = convStage->attrs().getOrDefault<bool>("tryHW", false);
        if (!convHW) {
            continue;
        }

        auto convInput = convStage->input(0);
        auto convWeights = convStage->input(1);
        auto convBiases = convStage->input(2);
        auto convOutput = convStage->output(0);

        // The conv output is replaced by per-tile buffers below, so it must
        // not be a network output or otherwise externally visible.
        if (convOutput->usage() != DataUsage::Intermediate) {
            continue;
        }

        // TODO : better estimation?
        // Split only when the full conv output does NOT fit into CMX.
        auto outBufSize = calculateHwBufferSize(convOutput->desc().dims());
        if (outBufSize <= env.resources.cmxLimit) {
            continue;
        }

        // The rewrite assumes a single conv -> pool edge.
        if (convOutput->numConsumers() != 1) {
            continue;
        }

        auto poolStage = convOutput->singleConsumer();
        if (poolStage->type() != StageType::StubAvgPool &&
            poolStage->type() != StageType::StubMaxPool) {
            continue;
        }

        auto poolHW = poolStage->attrs().getOrDefault<bool>("tryHW", false);
        if (!poolHW) {
            continue;
        }

        auto convKernelSizeX = convStage->attrs().get<int>("kernelSizeX");
        auto convKernelSizeY = convStage->attrs().get<int>("kernelSizeY");

        auto poolOutput = poolStage->output(0);

        // TODO : better estimation?
        // Pick the largest candidate tile size (in output channels) that both
        // divides the channel count exactly and makes one tile's conv output
        // fit into CMX. tileSize == 0 means no candidate worked.
        int tileSize = 0u;
        std::array<int, 3> TILE_SIZE_CANDIDATES{{128, 64, 32}};
        for (auto curTileSize : TILE_SIZE_CANDIDATES) {
            if (convOutput->desc().dim(Dim::C) >= curTileSize &&
                convOutput->desc().dim(Dim::C) % curTileSize == 0) {
                DimValues curOutDims;
                curOutDims.set(Dim::W, convOutput->desc().dim(Dim::W));
                curOutDims.set(Dim::H, convOutput->desc().dim(Dim::H));
                curOutDims.set(Dim::C, curTileSize);

                auto curOutBufSize = calculateHwBufferSize(curOutDims);
                if (curOutBufSize <= env.resources.cmxLimit) {
                    tileSize = curTileSize;
                    break;
                }
            }
        }

        if (tileSize == 0)
            continue;

        // Ceiling division; since tileSize divides dim(C) exactly (checked in
        // the candidate loop above), this is currently an exact division.
        auto numTiles = (convOutput->desc().dim(Dim::C) + tileSize - 1) / tileSize;

        // Detach both stages from their data edges before duplicating them;
        // the originals are removed at the end of this iteration.
        model->disconnectStageDatas(convStage);
        model->disconnectStageDatas(poolStage);

        DataVector subOutputs(numTiles);

        int tileOffset = 0;
        for (int tileInd = 0; tileInd < numTiles; ++tileInd) {
            auto postfix = formatString("@tile=%d/%d", tileInd + 1, numTiles);

            // Defensive remainder handling: with the divisibility check above
            // the last tile is currently the same size as the others.
            auto curTileSize = tileInd != numTiles - 1 ? tileSize : convOutput->desc().dim(Dim::C) - tileOffset;

            auto convOutputTileDesc = convOutput->desc();
            convOutputTileDesc.setDim(Dim::C, curTileSize);

            auto convOutputTile = model->duplicateData(
                convOutput,
                postfix,
                convOutputTileDesc);

            auto poolOutputTileDesc = poolOutput->desc();
            poolOutputTileDesc.setDim(Dim::C, curTileSize);

            auto poolOutputTile = model->duplicateData(
                poolOutput,
                postfix,
                poolOutputTileDesc);

            // Slice this tile's output-channel group out of the original
            // weights into a fresh FP16 blob.
            Data tileWeights;
            {
                auto content = convWeights->content();
                IE_ASSERT(content != nullptr);

                auto origWeights = content->get<fp16_t>();
                IE_ASSERT(origWeights != nullptr);

                auto kernWxH = convKernelSizeX * convKernelSizeY;
                size_t newWeightsSize = kernWxH * convInput->desc().dim(Dim::C) * tileSize;

                auto newWeightsBlob = ie::make_shared_blob<fp16_t>(ie::Precision::FP16, ie::Layout::C, {newWeightsSize});
                newWeightsBlob->allocate();

                // Offset by whole output-channel groups: assumes each output
                // channel occupies kernWxH * inputC contiguous fp16 values
                // (consistent with the {kx, ky, inC, outC} DataDesc below) --
                // TODO(review): confirm against the weight content layout.
                auto inPtr = origWeights + kernWxH * convInput->desc().dim(Dim::C) * tileInd * tileSize;
                auto outPtr = newWeightsBlob->buffer().as<fp16_t*>();

                std::copy_n(inPtr, newWeightsSize, outPtr);

                tileWeights = model->duplicateData(
                    convWeights,
                    postfix,
                    DataDesc({convKernelSizeX, convKernelSizeY, convInput->desc().dim(Dim::C), tileSize}),
                    ieBlobContent(newWeightsBlob));
            }

            // Biases are per-output-channel; slice the matching range.
            // A Fake biases input (no biases) is shared by all tiles as-is.
            auto tileBiases = convBiases;
            if (convBiases->usage() != DataUsage::Fake) {
                auto content = convBiases->content();
                IE_ASSERT(content != nullptr);

                auto origBiases = content->get<fp16_t>();
                IE_ASSERT(origBiases != nullptr);

                auto newBiasesBlob = ie::make_shared_blob<fp16_t>(ie::Precision::FP16, ie::Layout::C, {static_cast<size_t>(tileSize)});
                newBiasesBlob->allocate();

                auto inPtr = origBiases + tileInd * tileSize;
                auto outPtr = newBiasesBlob->buffer().as<fp16_t*>();

                std::copy_n(inPtr, tileSize, outPtr);

                tileBiases = model->duplicateData(
                    convBiases,
                    postfix,
                    DataDesc({tileSize}),
                    ieBlobContent(newBiasesBlob));
            }

            // Per-tile conv: full input, sliced weights/biases, sliced output.
            model->duplicateStage(
                convStage->name() + postfix,
                convStage,
                {convInput, tileWeights, tileBiases},
                {convOutputTile});

            // Per-tile pool consumes only its conv tile.
            model->duplicateStage(
                poolStage->name() + postfix,
                poolStage,
                {convOutputTile},
                {poolOutputTile});

            subOutputs[tileInd] = poolOutputTile;

            tileOffset += curTileSize;
        }

        // Merge the per-tile pool outputs back into the original pool output
        // along the channel axis.
        _stageBuilder->addConcatStage(
            model,
            poolStage->name() + "@concat",
            poolStage->origLayer(),
            Dim::C,
            subOutputs,
            poolOutput);

        model->removeStage(convStage);
        model->removeStage(poolStage);
    }
}
212
213 }  // namespace
214
215 Pass::Ptr PassManager::splitHwConvAndPool() {
216     return std::make_shared<PassImpl>(_stageBuilder);
217 }
218
219 }  // namespace vpu