1 // Copyright (C) 2018-2019 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
5 #include <vpu/pass_manager.hpp>
10 #include <unordered_set>
13 #include <ie_parallel.hpp>
15 #include <vpu/sw/utility.hpp>
16 #include <vpu/utils/numeric.hpp>
// Re-lays out depth-wise deconvolution weights from `src` into `dst`, mirroring
// every per-channel kernel along both spatial axes (kx -> KX - 1 - kx,
// ky -> KY - 1 - ky). Source and destination are both addressed as
// c * KX * KY + y * KX + x, i.e. the channel is the slowest-varying index.
// `src_size` and `dst_size` are used only to bounds-check the computed indices.
22 void depthDeconvolutionRelayoutCHW(
23 const fp16_t* src, int src_size,
24 fp16_t* dst, int dst_size,
// The lambda below is invoked with (channel, ky, kx) coordinates.
27 ie::parallel_for3d(channels, KY, KX, [=](int c, int ky, int kx) {
// Source element: position (ky, kx) inside the kernel of channel c.
28 int iidx = c * KX * KY + ky * KX + kx;
29 IE_ASSERT(iidx >= 0 && iidx < src_size);
// Mirrored kernel coordinates.
31 int inv_kx = KX - kx - 1;
32 int inv_ky = KY - ky - 1;
// Destination keeps the same channel, but uses the mirrored kernel position.
33 int oidx = c * KX * KY + inv_ky * KX + inv_kx;
34 IE_ASSERT(oidx >= 0 && oidx < dst_size);
36 dst[oidx] = src[iidx];
// Calculated data content that produces depth-wise deconvolution weights by
// running depthDeconvolutionRelayoutCHW over the original weights content.
40 class DepthDeconvolutionCHWWeightsContent final : public CalculatedDataContent {
// Registers `origContent` as the single base content and stores the kernel
// width (KX), kernel height (KY) and channel count for the later relayout.
42 DepthDeconvolutionCHWWeightsContent(
43 const DataContent::Ptr& origContent,
44 int KX, int KY, int channels) :
45 CalculatedDataContent({origContent}),
46 _KX(KX), _KY(KY), _channels(channels) {
// Fills `tempBuf` with the re-laid-out weights read from baseContents[0].
// `tempBuf` is treated as an fp16_t array; _desc.totalDimSize() is passed as
// the element count of both the source and the destination.
// NOTE(review): `_desc` is not declared in this class as visible here -
// presumably inherited from the base class; confirm.
50 void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
51 VPU_PROFILE(DepthDeconvolutionCHWWeightsContent);
52 depthDeconvolutionRelayoutCHW(
53 baseContents[0]->get<fp16_t>(), _desc.totalDimSize(),
54 static_cast<fp16_t*>(tempBuf), _desc.totalDimSize(),
// Re-lays out depth-wise deconvolution weights from `src` into `dst`, mirroring
// every per-channel kernel along both spatial axes and interleaving channels.
// The source is addressed as c * KX * KY + ky * KX + kx (channel slowest),
// the destination as y * KX * channels + x * channels + c (channel fastest).
// `src_size` and `dst_size` are used only to bounds-check the computed indices.
// NOTE(review): unlike depthDeconvolutionRelayoutCHW, the asserts here check
// only the upper bound and not `>= 0` - consider aligning the two functions.
64 void depthDeconvolutionRelayoutHWC(
65 const fp16_t* src, int src_size,
66 fp16_t* dst, int dst_size,
// The lambda below is invoked with (channel, ky, kx) coordinates.
69 ie::parallel_for3d(channels, KY, KX, [=](int c, int ky, int kx) {
// Source element: position (ky, kx) inside the kernel of channel c.
70 int iidx = c * KX * KY + ky * KX + kx;
71 IE_ASSERT(iidx < src_size);
// Mirrored kernel coordinates.
73 int inv_kx = KX - kx - 1;
74 int inv_ky = KY - ky - 1;
// Destination: mirrored kernel position first, channel as the innermost index.
75 int oidx = inv_ky * KX * channels + inv_kx * channels + c;
76 IE_ASSERT(oidx < dst_size);
78 dst[oidx] = src[iidx];
// Calculated data content that produces depth-wise deconvolution weights by
// running depthDeconvolutionRelayoutHWC over the original weights content.
82 class DepthDeconvolutionHWCWeightsContent final : public CalculatedDataContent {
// Registers `origContent` as the single base content and stores the kernel
// width (KX), kernel height (KY) and channel count for the later relayout.
84 DepthDeconvolutionHWCWeightsContent(
85 const DataContent::Ptr& origContent,
86 int KX, int KY, int channels) :
87 CalculatedDataContent({origContent}),
88 _KX(KX), _KY(KY), _channels(channels) {
// Fills `tempBuf` with the re-laid-out weights read from baseContents[0].
// `tempBuf` is treated as an fp16_t array; _desc.totalDimSize() is passed as
// the element count of both the source and the destination.
// NOTE(review): `_desc` is not declared in this class as visible here -
// presumably inherited from the base class; confirm.
92 void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
93 VPU_PROFILE(DepthDeconvolutionHWCWeightsContent);
94 depthDeconvolutionRelayoutHWC(
95 baseContents[0]->get<fp16_t>(), _desc.totalDimSize(),
96 static_cast<fp16_t*>(tempBuf), _desc.totalDimSize(),
// Re-lays out (non depth-wise) deconvolution weights from `src` into `dst`,
// one element per (oc, ic, ky, kx) combination. The source index starts with
// the term ic * OC * KY * KX and the destination index with oc * IC * KY * KX,
// so the leading channel index differs between the two buffers.
// `src_size` and `dst_size` are used only to bounds-check the computed indices.
106 void deconvolutionRelayout(
107 const fp16_t* src, int src_size,
108 fp16_t* dst, int dst_size,
// The lambda below is invoked with (oc, ic, ky, kx) coordinates.
111 ie::parallel_for4d(OC, IC, KY, KX, [=](int oc, int ic, int ky, int kx) {
// Source index: the input channel provides the leading term.
112 int iidx = ic * OC * KY * KX
116 IE_ASSERT(iidx >= 0 && iidx < src_size);
// Mirrored kernel coordinates.
// NOTE(review): presumably consumed by the destination index - confirm.
118 int inv_kx = KX - kx - 1;
119 int inv_ky = KY - ky - 1;
// Destination index: the output channel provides the leading term.
120 int oidx = oc * IC * KY * KX
124 IE_ASSERT(oidx >= 0 && oidx < dst_size);
126 dst[oidx] = src[iidx];
// Calculated data content that produces weights for the software deconvolution
// stage in two steps: deconvolutionRelayout followed by kchw_to_hwkc.
130 class DeconvolutionWeightsContent final : public CalculatedDataContent {
// Registers `origContent` as the single base content.
132 DeconvolutionWeightsContent(
133 const DataContent::Ptr& origContent,
136 CalculatedDataContent({origContent}),
// Requests room for two fp16_t copies of the tensor: fillTempBuf uses the
// second half of the buffer as scratch space for the intermediate result.
142 size_t getTempBufSize(const SmallVector<DataContent::Ptr, 2>&) const override {
143 return 2 * _desc.totalDimSize() * sizeof(fp16_t);
// Fills the first _desc.totalDimSize() elements of `tempBuf` with the final
// weights, using the following _desc.totalDimSize() elements as scratch space.
147 void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
148 VPU_PROFILE(DeconvolutionWeightsContent);
// dstPtr  - start of the buffer, receives the final result.
// dstPtr2 - second half of the buffer, holds the intermediate relayout.
150 auto dstPtr = static_cast<fp16_t*>(tempBuf);
151 auto dstPtr2 = dstPtr + _desc.totalDimSize();
// Step 1: relayout of the original weights into the scratch half.
153 deconvolutionRelayout(
154 baseContents[0]->get<fp16_t>(), _desc.totalDimSize(),
155 dstPtr2, _desc.totalDimSize(),
// Step 2: convert from the scratch half into the start of the buffer.
// NOTE(review): kchw_to_hwkc is defined outside this file; judging by its name
// it converts a KCHW layout to HWKC - confirm against its declaration.
159 kchw_to_hwkc(dstPtr2, dstPtr, _desc);
// Stage node implementing the software deconvolution. The same class serves
// two stage types that are distinguished through `_type`:
// StageType::DepthDeconv and the regular (non depth-wise) deconvolution.
// Every method asserts exactly three input edges (input, weights, biases)
// and one output edge.
169 class DeconvStage final : public StageNode {
// Clones the stage by copy-constructing a new DeconvStage.
171 StagePtr cloneImpl() const override {
172 return std::make_shared<DeconvStage>(*this);
// Scale propagation is not supported for this stage: the method always throws.
175 DataMap<float> propagateScaleFactorsImpl(
176 const DataMap<float>&,
177 ScalePropagationStep) override {
178 VPU_THROW_EXCEPTION << "Must never be called";
// Chooses the dims orders requested for the input and output data.
// The starting point is the current input order; if the channel dimension
// sits at index 1 it is moved to index 2.
181 DataMap<DimsOrder> propagateDataOrderImpl() const override {
182 IE_ASSERT(_inputEdges.size() == 3);
183 IE_ASSERT(_outputEdges.size() == 1);
185 auto input = _inputEdges[0]->input();
186 auto weights = _inputEdges[1]->input();
187 auto output = _outputEdges[0]->output();
189 auto finalOrder = input->desc().dimsOrder();
190 if (finalOrder.dimInd(Dim::C) == 1) {
192 finalOrder.moveDim(Dim::C, 2);
195 DataMap<DimsOrder> out;
// Depth-wise case: the input order is overridden only when it changed,
// the output always receives the final order.
197 if (_type == StageType::DepthDeconv) {
198 if (finalOrder != input->desc().dimsOrder()) {
199 out[input] = finalOrder;
201 out[output] = finalOrder;
// Remaining case: input and output both get the final order with the
// channel dimension moved to index 0.
203 out[input] = finalOrder.createMovedDim(Dim::C, 0);
204 out[output] = finalOrder.createMovedDim(Dim::C, 0);
// Reports the stride requirements of the stage data. Every assignment below
// requests compact strides for both the input and the output.
210 DataMap<StridesRequirement> getDataStridesRequirementsImpl() const override {
211 IE_ASSERT(_inputEdges.size() == 3);
212 IE_ASSERT(_outputEdges.size() == 1);
214 auto input = _inputEdges[0]->input();
215 auto weights = _inputEdges[1]->input();
216 auto output = _outputEdges[0]->output();
218 auto finalOrder = input->desc().dimsOrder();
219 if (finalOrder.dimInd(Dim::C) == 1) {
221 finalOrder.moveDim(Dim::C, 2);
224 DataMap<StridesRequirement> out;
226 if (_type == StageType::DepthDeconv) {
227 if (finalOrder.dimInd(Dim::C) == 0) {
229 out[input] = StridesRequirement::compact();
230 out[output] = StridesRequirement::compact();
233 out[input] = StridesRequirement::compact();
234 out[output] = StridesRequirement::compact();
// Replaces the weights input (edge 1) with weights re-laid-out for the
// software kernel. The converted data is cached in the "swWeights" attribute
// of the original weights, so it is created only when that attribute is not
// set yet. The content class is chosen from the stage type and the index of
// the channel dimension in the input order.
240 void finalizeDataLayoutImpl() override {
241 IE_ASSERT(_inputEdges.size() == 3);
242 IE_ASSERT(_outputEdges.size() == 1);
244 auto input = _inputEdges[0]->input();
245 auto weights = _inputEdges[1]->input();
246 auto output = _outputEdges[0]->output();
248 auto kernelSizeX = attrs().get<int>("kernelSizeX");
249 auto kernelSizeY = attrs().get<int>("kernelSizeY");
253 if (_type == StageType::DepthDeconv) {
// Depth-wise, channel dimension at index 0: use the HWC weights content.
254 if (input->desc().dimsOrder().dimInd(Dim::C) == 0) {
259 swWeights = weights->attrs().getOrDefault<Data>("swWeights", nullptr);
260 if (swWeights == nullptr) {
261 DataDesc newWeightsDesc({
262 kernelSizeX * kernelSizeY,
264 output->desc().dim(Dim::C)});
266 swWeights = _model->duplicateData(
270 std::make_shared<DepthDeconvolutionHWCWeightsContent>(
272 kernelSizeX, kernelSizeY,
273 output->desc().dim(Dim::C)));
// Cache the converted weights on the original weights data.
275 weights->attrs().set<Data>("swWeights", swWeights);
// Depth-wise, channel dimension at index 2: use the CHW weights content.
277 } else if (input->desc().dimsOrder().dimInd(Dim::C) == 2) {
282 swWeights = weights->attrs().getOrDefault<Data>("swWeights", nullptr);
283 if (swWeights == nullptr) {
284 DataDesc newWeightsDesc({
285 kernelSizeX * kernelSizeY,
287 output->desc().dim(Dim::C)});
289 swWeights = _model->duplicateData(
293 std::make_shared<DepthDeconvolutionCHWWeightsContent>(
295 kernelSizeX, kernelSizeY,
296 output->desc().dim(Dim::C)));
298 weights->attrs().set<Data>("swWeights", swWeights);
// Weights for the regular deconvolution: the descriptor and the content take
// both the input and the output channel counts.
302 swWeights = weights->attrs().getOrDefault<Data>("swWeights", nullptr);
303 if (swWeights == nullptr) {
304 DataDesc newWeightsDesc({
305 kernelSizeX * kernelSizeY,
306 input->desc().dim(Dim::C),
307 output->desc().dim(Dim::C)});
309 swWeights = _model->duplicateData(
313 std::make_shared<DeconvolutionWeightsContent>(
315 kernelSizeX, kernelSizeY,
316 input->desc().dim(Dim::C),
317 output->desc().dim(Dim::C)));
319 weights->attrs().set<Data>("swWeights", swWeights);
// Fails if none of the branches above produced converted weights.
323 IE_ASSERT(swWeights != nullptr);
325 _model->replaceStageInput(_inputEdges[1], swWeights);
// Reports batch support: input and output are marked BatchSupport::Split.
// Weights must be constant data; biases must be constant or fake data.
328 DataMap<BatchSupport> getBatchSupportInfoImpl() const override {
329 IE_ASSERT(_inputEdges.size() == 3);
330 IE_ASSERT(_outputEdges.size() == 1);
332 auto input = _inputEdges[0]->input();
333 auto weights = _inputEdges[1]->input();
334 auto biases = _inputEdges[2]->input();
335 auto output = _outputEdges[0]->output();
337 DataMap<BatchSupport> out;
339 IE_ASSERT(weights->usage() == DataUsage::Const);
340 IE_ASSERT(biases->usage() == DataUsage::Const || biases->usage() == DataUsage::Fake);
342 out[input] = BatchSupport::Split;
343 out[output] = BatchSupport::Split;
// Final consistency check hook of the stage.
348 void finalCheckImpl() const override {
// Serializes the stage parameters as uint32_t values in this order:
// kernel size X/Y, kernel stride X/Y, left pad, top pad, dilation X/Y.
// The right and bottom pads are not read here.
351 void serializeParamsImpl(BlobSerializer& serializer) const override {
352 auto kernelSizeX = attrs().get<int>("kernelSizeX");
353 auto kernelSizeY = attrs().get<int>("kernelSizeY");
354 auto kernelStrideX = attrs().get<int>("kernelStrideX");
355 auto kernelStrideY = attrs().get<int>("kernelStrideY");
356 auto padLeft = attrs().get<int>("padLeft");
357 auto padTop = attrs().get<int>("padTop");
358 auto dilationX = attrs().get<int>("dilationX");
359 auto dilationY = attrs().get<int>("dilationY");
361 serializer.append(static_cast<uint32_t>(kernelSizeX));
362 serializer.append(static_cast<uint32_t>(kernelSizeY));
363 serializer.append(static_cast<uint32_t>(kernelStrideX));
364 serializer.append(static_cast<uint32_t>(kernelStrideY));
365 serializer.append(static_cast<uint32_t>(padLeft));
366 serializer.append(static_cast<uint32_t>(padTop));
367 serializer.append(static_cast<uint32_t>(dilationX));
368 serializer.append(static_cast<uint32_t>(dilationY));
// Serializes the stage buffers in this order: input, output, weights,
// the first temporary buffer (only when one is attached) and the biases.
371 void serializeDataImpl(BlobSerializer& serializer) const override {
372 IE_ASSERT(_inputEdges.size() == 3);
373 IE_ASSERT(_outputEdges.size() == 1);
375 auto input = _inputEdges[0]->input();
376 auto weights = _inputEdges[1]->input();
377 auto biases = _inputEdges[2]->input();
378 auto output = _outputEdges[0]->output();
380 input->serializeOldBuffer(handle_from_this(), serializer);
381 output->serializeOldBuffer(handle_from_this(), serializer);
382 weights->serializeOldBuffer(handle_from_this(), serializer);
384 if (!_tempBufferEdges.empty()) {
385 _tempBufferEdges[0]->tempBuffer()->serializeOldBuffer(handle_from_this(), serializer);
389 biases->serializeOldBuffer(handle_from_this(), serializer);
// Pass that processes StubDeconv stages of a model (see PassImpl::run).
393 class PassImpl final : public Pass {
// Keeps the stage builder that run() uses to add bias stages.
395 explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}
// Runs the pass over `model`.
397 void run(const Model::Ptr& model) override;
400 StageBuilder::Ptr _stageBuilder;
// Walks over the stages of `model` and handles those of type StubDeconv:
// the stub is disconnected from its data, a DeconvStage is added with the same
// kernel/stride/pad/dilation attributes, and the stub is removed.
// groupSize == 1 creates a StageType::Deconvolution stage; groupSize equal to
// both the input and the output channel count creates StageType::DepthDeconv.
// Throws for an invalid group value and for grouped deconvolutions that match
// neither of the two cases.
403 void PassImpl::run(const Model::Ptr& model) {
404 VPU_PROFILE(swDeconvAdaptation);
406 for (const auto& stage : model->getStages()) {
// Only StubDeconv stages are of interest.
407 if (stage->type() != StageType::StubDeconv)
410 auto input = stage->input(0);
411 auto weights = stage->input(1);
412 auto biases = stage->input(2);
413 auto output = stage->output(0);
// Read every attribute up front: the stub stage is removed at the end.
415 auto kernelSizeX = stage->attrs().get<int>("kernelSizeX");
416 auto kernelSizeY = stage->attrs().get<int>("kernelSizeY");
417 auto kernelStrideX = stage->attrs().get<int>("kernelStrideX");
418 auto kernelStrideY = stage->attrs().get<int>("kernelStrideY");
419 auto padLeft = stage->attrs().get<int>("padLeft");
420 auto padRight = stage->attrs().get<int>("padRight");
421 auto padTop = stage->attrs().get<int>("padTop");
422 auto padBottom = stage->attrs().get<int>("padBottom");
423 auto dilationX = stage->attrs().get<int>("dilationX");
424 auto dilationY = stage->attrs().get<int>("dilationY");
425 auto groupSize = stage->attrs().get<int>("groupSize");
// NOTE(review): the stub is disconnected before groupSize is validated, so the
// exceptions thrown below are raised with the stage already detached.
427 model->disconnectStageDatas(stage);
// The group count must be non-zero, must not exceed the input or the output
// channel count, and must divide both of them. The zero check comes first,
// which keeps the modulo operations below safe.
429 if (groupSize == 0 ||
430 (groupSize > input->desc().dim(Dim::C)) ||
431 (input->desc().dim(Dim::C) % groupSize != 0) ||
432 (groupSize > output->desc().dim(Dim::C)) ||
433 (output->desc().dim(Dim::C) % groupSize != 0)) {
434 VPU_THROW_EXCEPTION << "DeconvolutionLayer has invalid group value";
// Case 1: regular (non-grouped) deconvolution.
437 if (groupSize == 1) {
// Real (non-fake) biases are handled by a dedicated "<stage name>@biases"
// stage added through the stage builder.
438 if (biases->usage() != DataUsage::Fake) {
439 auto tempOutput = model->duplicateData(
443 _stageBuilder->addBiasStage(
445 stage->name() + "@biases",
453 auto swStage = model->addNewStage<DeconvStage>(
455 StageType::Deconvolution,
457 {input, weights, biases},
// Copy the stub attributes to the new stage.
460 swStage->attrs().set<int>("kernelSizeX", kernelSizeX);
461 swStage->attrs().set<int>("kernelSizeY", kernelSizeY);
463 swStage->attrs().set<int>("kernelStrideX", kernelStrideX);
464 swStage->attrs().set<int>("kernelStrideY", kernelStrideY);
466 swStage->attrs().set<int>("padLeft", padLeft);
467 swStage->attrs().set<int>("padRight", padRight);
468 swStage->attrs().set<int>("padTop", padTop);
469 swStage->attrs().set<int>("padBottom", padBottom);
471 swStage->attrs().set<int>("dilationX", dilationX);
472 swStage->attrs().set<int>("dilationY", dilationY);
// Case 2: depth-wise deconvolution - one group per input and output channel.
// NOTE(review): this branch repeats the bias handling and the attribute
// copying of the branch above; a shared helper would remove the duplication.
473 } else if (groupSize == input->desc().dim(Dim::C) &&
474 groupSize == output->desc().dim(Dim::C)) {
475 if (biases->usage() != DataUsage::Fake) {
476 auto tempOutput = model->duplicateData(
480 _stageBuilder->addBiasStage(
482 stage->name() + "@biases",
490 auto swStage = model->addNewStage<DeconvStage>(
492 StageType::DepthDeconv,
494 {input, weights, biases},
497 swStage->attrs().set<int>("kernelSizeX", kernelSizeX);
498 swStage->attrs().set<int>("kernelSizeY", kernelSizeY);
500 swStage->attrs().set<int>("kernelStrideX", kernelStrideX);
501 swStage->attrs().set<int>("kernelStrideY", kernelStrideY);
503 swStage->attrs().set<int>("padLeft", padLeft);
504 swStage->attrs().set<int>("padRight", padRight);
505 swStage->attrs().set<int>("padTop", padTop);
506 swStage->attrs().set<int>("padBottom", padBottom);
508 swStage->attrs().set<int>("dilationX", dilationX);
509 swStage->attrs().set<int>("dilationY", dilationY);
// Any other group configuration is not handled by this pass.
511 VPU_THROW_EXCEPTION << "Internal error : grouped deconvolution was not processed";
// The stub stage has been replaced and is no longer needed.
514 model->removeStage(stage);
// Creates the software deconvolution adaptation pass, handing it the
// manager's stage builder.
520 Pass::Ptr PassManager::swDeconvAdaptation() {
521 return std::make_shared<PassImpl>(_stageBuilder);