1 // Copyright (C) 2018-2019 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
5 #include <vpu/pass_manager.hpp>
12 #include <vpu/compile_env.hpp>
13 #include <vpu/hw/utility.hpp>
// Pass object created by PassManager::splitHwConvAndPool().
// NOTE(review): access specifiers and the closing brace of this class are not
// visible in this excerpt; only the visible members are documented.
19 class PassImpl final : public Pass {
// Keeps the given stage builder; run() uses it to add the concat stage.
21 explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}
// Pass entry point operating on the given model; defined out of line.
23 void run(const Model::Ptr& model) override;
// Stage builder supplied at construction time.
26 StageBuilder::Ptr _stageBuilder;
// Walks the model's stages looking for a StubConv stage whose output is consumed
// by exactly one StubAvgPool/StubMaxPool stage, then replaces that pair with
// per-channel-tile duplicates of both stages followed by a concat stage.
// NOTE(review): several lines of this function (guard bodies, some declarations
// and call arguments) are not visible in this excerpt; the comments below
// describe only what the visible lines show and hedge everything else.
29 void PassImpl::run(const Model::Ptr& model) {
30 VPU_PROFILE(splitHwConvAndPool);
// Compile environment; its resources.cmxLimit is the buffer-size threshold used below.
32 const auto& env = CompileEnv::get();
34 for (const auto& convStage : model->getStages()) {
// Guard: null stage entries (guard body not visible; presumably skipped).
35 if (convStage == nullptr) {
// Guard: only StubConv stages are candidates.
39 if (convStage->type() != StageType::StubConv) {
// "tryHW" attribute of the convolution, false when absent.
// NOTE(review): the check that consumes convHW is not visible here.
43 auto convHW = convStage->attrs().getOrDefault<bool>("tryHW", false);
// Convolution operands: input 0 = data, input 1 = weights, input 2 = biases.
48 auto convInput = convStage->input(0);
49 auto convWeights = convStage->input(1);
50 auto convBiases = convStage->input(2);
51 auto convOutput = convStage->output(0);
// Guard: the convolution output must be an Intermediate data object.
53 if (convOutput->usage() != DataUsage::Intermediate) {
57 // TODO : better estimation?
// Compares the HW buffer size of the whole convolution output with the CMX limit.
// NOTE(review): presumably the stage is left untouched when the output already
// fits; the guard body is not visible, confirm.
58 auto outBufSize = calculateHwBufferSize(convOutput->desc().dims());
59 if (outBufSize <= env.resources.cmxLimit) {
// Guard: the convolution output must have exactly one consumer...
63 if (convOutput->numConsumers() != 1) {
// ...and that consumer must be an average or max pooling stub.
67 auto poolStage = convOutput->singleConsumer();
68 if (poolStage->type() != StageType::StubAvgPool &&
69 poolStage->type() != StageType::StubMaxPool) {
// "tryHW" attribute of the pooling stage, false when absent.
// NOTE(review): the check that consumes poolHW is not visible here.
73 auto poolHW = poolStage->attrs().getOrDefault<bool>("tryHW", false);
// Kernel extents, needed below to size the per-tile weights.
78 auto convKernelSizeX = convStage->attrs().get<int>("kernelSizeX");
79 auto convKernelSizeY = convStage->attrs().get<int>("kernelSizeY");
81 auto poolOutput = poolStage->output(0);
83 // TODO : better estimation?
// Tile size selection: candidates are tried from largest to smallest. A candidate
// qualifies when it does not exceed the output channel count, divides it evenly,
// and a buffer of the full W x H with that many channels fits the CMX limit.
// NOTE(review): the declarations of tileSize/curOutDims, whether the loop stops
// at the first match, and the handling of "no candidate fits" are not visible.
85 std::array<int, 3> TILE_SIZE_CANDIDATES{{128, 64, 32}};
86 for (auto curTileSize : TILE_SIZE_CANDIDATES) {
87 if (convOutput->desc().dim(Dim::C) >= curTileSize &&
88 convOutput->desc().dim(Dim::C) % curTileSize == 0) {
90 curOutDims.set(Dim::W, convOutput->desc().dim(Dim::W));
91 curOutDims.set(Dim::H, convOutput->desc().dim(Dim::H));
92 curOutDims.set(Dim::C, curTileSize);
94 auto curOutBufSize = calculateHwBufferSize(curOutDims);
95 if (curOutBufSize <= env.resources.cmxLimit) {
96 tileSize = curTileSize;
// Number of channel tiles, rounded up.
105 auto numTiles = (convOutput->desc().dim(Dim::C) + tileSize - 1) / tileSize;
// Detach both original stages from their data objects before building the tiles.
107 model->disconnectStageDatas(convStage);
108 model->disconnectStageDatas(poolStage);
// One slot per tile; filled with the pooling output of each tile.
110 DataVector subOutputs(numTiles);
113 for (int tileInd = 0; tileInd < numTiles; ++tileInd) {
// Name suffix for the per-tile objects, e.g. "@tile=1/4".
114 auto postfix = formatString("@tile=%d/%d", tileInd + 1, numTiles);
// Every tile but the last has tileSize channels; the last takes whatever remains
// after tileOffset (its declaration is not visible in this excerpt).
116 auto curTileSize = tileInd != numTiles - 1 ? tileSize : convOutput->desc().dim(Dim::C) - tileOffset;
// Per-tile convolution output: original descriptor with the channel count replaced.
118 auto convOutputTileDesc = convOutput->desc();
119 convOutputTileDesc.setDim(Dim::C, curTileSize);
121 auto convOutputTile = model->duplicateData(
// Per-tile pooling output: original descriptor with the channel count replaced.
126 auto poolOutputTileDesc = poolOutput->desc();
127 poolOutputTileDesc.setDim(Dim::C, curTileSize);
129 auto poolOutputTile = model->duplicateData(
// Weights for this tile: a contiguous run of fp16 values copied out of the
// original weights content into a freshly allocated blob.
// NOTE(review): the size and offset use tileSize rather than curTileSize; the two
// are equal only while the tile size divides the channel count, which the visible
// selection loop above requires. Confirm no other path can set tileSize.
136 auto content = convWeights->content();
137 IE_ASSERT(content != nullptr);
139 auto origWeights = content->get<fp16_t>();
140 IE_ASSERT(origWeights != nullptr);
142 auto kernWxH = convKernelSizeX * convKernelSizeY;
143 size_t newWeightsSize = kernWxH * convInput->desc().dim(Dim::C) * tileSize;
145 auto newWeightsBlob = ie::make_shared_blob<fp16_t>(ie::Precision::FP16, ie::Layout::C, {newWeightsSize});
146 newWeightsBlob->allocate();
// Source offset: tileInd whole tiles of (kernel area * input channels * tileSize) elements.
148 auto inPtr = origWeights + kernWxH * convInput->desc().dim(Dim::C) * tileInd * tileSize;
149 auto outPtr = newWeightsBlob->buffer().as<fp16_t*>();
151 std::copy_n(inPtr, newWeightsSize, outPtr);
153 tileWeights = model->duplicateData(
156 DataDesc({convKernelSizeX, convKernelSizeY, convInput->desc().dim(Dim::C), tileSize}),
157 ieBlobContent(newWeightsBlob));
// Biases: the original data object is reused as-is when its usage is Fake;
// otherwise a tileSize-element slice starting at tileInd * tileSize is copied.
160 auto tileBiases = convBiases;
161 if (convBiases->usage() != DataUsage::Fake) {
162 auto content = convBiases->content();
163 IE_ASSERT(content != nullptr);
165 auto origBiases = content->get<fp16_t>();
166 IE_ASSERT(origBiases != nullptr);
168 auto newBiasesBlob = ie::make_shared_blob<fp16_t>(ie::Precision::FP16, ie::Layout::C, {static_cast<size_t>(tileSize)});
169 newBiasesBlob->allocate();
171 auto inPtr = origBiases + tileInd * tileSize;
172 auto outPtr = newBiasesBlob->buffer().as<fp16_t*>();
174 std::copy_n(inPtr, tileSize, outPtr);
176 tileBiases = model->duplicateData(
179 DataDesc({tileSize}),
180 ieBlobContent(newBiasesBlob));
// Duplicate the convolution for this tile, fed by the shared input and the
// per-tile weights/biases (remaining arguments are not visible).
183 model->duplicateStage(
184 convStage->name() + postfix,
186 {convInput, tileWeights, tileBiases},
// Duplicate the pooling stage for this tile (arguments are not visible).
189 model->duplicateStage(
190 poolStage->name() + postfix,
// Remember this tile's pooling output and advance the channel offset.
195 subOutputs[tileInd] = poolOutputTile;
197 tileOffset += curTileSize;
// Add a concat stage named after the pooling stage.
// NOTE(review): presumably it joins subOutputs back into poolOutput; those
// arguments are not visible, confirm.
200 _stageBuilder->addConcatStage(
202 poolStage->name() + "@concat",
203 poolStage->origLayer(),
// The original (unsplit) stages are removed from the model.
208 model->removeStage(convStage);
209 model->removeStage(poolStage);
// Creates the pass object: a shared PassImpl constructed from _stageBuilder.
215 Pass::Ptr PassManager::splitHwConvAndPool() {
216 return std::make_shared<PassImpl>(_stageBuilder);