// Publishing 2019 R1 content
// [platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / kernel_selector / core / actual_kernels / convolution / convolution_kernel_imad_3x3.cpp
/*
// Copyright (c) 2018-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/
16
#include "convolution_kernel_imad_3x3.h"

#include <cassert>

#include "kernel_selector_utils.h"
#include "common_tools.h"
20
//
// Kernel specific constants
//
#define SIMD_SIZE             16
// Threshold value to calculate the block size.
#define OUT_BLOCK_THRESHOLD   7
// For images 7x7 it's 7 (default), for 14x14 and above it's 14.
#define OUT_BLOCK_WIDTH       7
// For images 7x7 it's 1 (default), for 14x14 and above it's 2.
#define OUT_BLOCK_HEIGHT      1

// Selects the output tile (outW x outH) processed by one work-item.
// Optimistically tries the doubled tile (14x2); shrinks back to the
// default (7x1) for small images or when the doubled row would not fit
// into one SIMD row; finally falls back to a 4x4 tile when even the
// default width does not fit.
static void getOutBlock_WH(size_t inW, size_t Stride, size_t Pad, size_t& outW, size_t& outH)
{
    // Start from the doubled (14x14-and-above) tile.
    outW = 2 * OUT_BLOCK_WIDTH;
    outH = 2 * OUT_BLOCK_HEIGHT;

    const bool small_image   = (inW <= OUT_BLOCK_THRESHOLD);
    const bool row_too_wide  = (outW * Stride + Pad > SIMD_SIZE);
    if (small_image || row_too_wide) {
        // Fall back to the default tile.
        outW = OUT_BLOCK_WIDTH;
        outH = OUT_BLOCK_HEIGHT;
    }
    if (outW * Stride + Pad > SIMD_SIZE) {
        // Even the default width does not fit - use a square 4x4 tile.
        outW = 4;
        outH = 4;
    }

    assert(outW * Stride + Pad <= SIMD_SIZE);
} // getOutBlock_WH
48
49 namespace kernel_selector {
50
51     ParamsKey ConvolutionKernel_imad_3x3::GetSupportedKey() const
52     {
53         ParamsKey k;
54         k.EnableInputDataType(Datatype::INT8);
55         k.EnableInputDataType(Datatype::UINT8);
56         k.EnableOutputDataType(Datatype::INT8);
57         k.EnableOutputDataType(Datatype::UINT8);
58         k.EnableInputWeightsType(WeightsType::INT8);
59         k.EnableInputWeightsType(WeightsType::UINT8);
60         k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
61         k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
62         k.EnableDifferentInputWeightsTypes();
63         k.EnableTensorOffset();
64         k.EnableTensorPitches();
65         k.EnableDilation();
66         k.EnableBiasPerFeature();
67         k.EnableNonBiasTerm();
68         k.EnableBatching();
69         k.EnableInt8Quantization();
70         k.EnableOutputCalibration();
71         k.DisableTuning();
72         return k;
73     }
74
75     KernelsData
76     ConvolutionKernel_imad_3x3::GetKernelsData(
77                                     const Params&          params,
78                                     const optional_params& options) const
79     {
80         return GetCommonKernelsData(params, options);
81     }
82
83     JitConstants
84     ConvolutionKernel_imad_3x3::GetJitConstants(
85                                     const convolution_params& params,
86                                     const DispatchData&       kd) const
87     {
88         auto mem_consts = Parent::GetJitConstants(params, kd);
89
90         const auto& input = params.inputs[0];
91         const auto& output = params.output;
92
93         const auto& iDims   = input.GetDims();
94         const auto& oDims = output.GetDims();
95         const auto& weights = params.weights;
96         const auto& wDims   = weights.GetDims();
97         const int iX  = DataTensor::Channelndex(
98                             input.GetLayout(), Tensor::DataChannelName::X);
99         const int iY  = DataTensor::Channelndex(
100                             input.GetLayout(), Tensor::DataChannelName::Y);
101         const int iB  = DataTensor::Channelndex(
102                             input.GetLayout(), Tensor::DataChannelName::BATCH);
103         const int iF  = DataTensor::Channelndex(
104                             input.GetLayout(), Tensor::DataChannelName::FEATURE);
105         const int wOD = WeightsTensor::Channelndex(
106                             weights.GetLayout(), Tensor::WeightsChannelName::OFM);
107         const int oX = DataTensor::Channelndex(
108             output.GetLayout(), Tensor::DataChannelName::X);
109         const int oY = DataTensor::Channelndex(
110             output.GetLayout(), Tensor::DataChannelName::Y);
111         mem_consts.AddConstants({
112             MakeJitConstant("_IMAD_DEFINES",   1),
113             //MakeJitConstant("SCALE_FACTOR",     m_ScaleFactor), //(255.0f / 700000.0f);
114             MakeJitConstant("_IW",              iDims[iX].v),
115             MakeJitConstant("_IH",              iDims[iY].v),
116             MakeJitConstant("_ID",              RoundUp(iDims[iF].v, 4)),
117             MakeJitConstant("IWPAD",            iDims[iX].pad.before + iDims[iX].pad.after),
118             MakeJitConstant("IHPAD",            iDims[iY].pad.before + iDims[iY].pad.after),
119             MakeJitConstant("_OW",              oDims[oX].v),
120             MakeJitConstant("_OH",              oDims[oY].v),
121             MakeJitConstant("_OD",              wDims[wOD].v),
122             MakeJitConstant("OWPAD",            oDims[oX].pad.before + oDims[oX].pad.after),
123             MakeJitConstant("OHPAD",            oDims[oY].pad.before + oDims[oY].pad.after),
124             MakeJitConstant("SIMD_SIZE",        SIMD_SIZE),
125             MakeJitConstant("K_HEIGHT",         wDims[iY].v),
126             MakeJitConstant("K_WIDTH",          wDims[iX].v),
127             MakeJitConstant("K_STRIDE",         params.stride.x), // X and Y must be equal
128             MakeJitConstant("BATCH_SIZE",       iDims[iB].v),
129             MakeJitConstant("WORKGROUP_SIZE",   "SIMD_SIZE"),
130         });
131
132         size_t obw, obh;
133         getOutBlock_WH(iDims[iX].v, params.stride.x, iDims[iX].pad.before + iDims[iX].pad.after,
134                        obw, obh);
135         mem_consts.AddConstants({
136             MakeJitConstant("OUT_BLOCK_WIDTH",  obw),
137             MakeJitConstant("OUT_BLOCK_HEIGHT", obh)
138         });
139
140         // FM_TILE definition
141         mem_consts.AddConstants({
142             MakeJitConstant("IMAD_LENGTH", 4),
143             MakeJitConstant("SYSTOLIC_DEPTH", 1),
144             MakeJitConstant("FM_TILE", "(IMAD_LENGTH * SYSTOLIC_DEPTH)")
145         });
146
147         if (input.GetDType() == Datatype::UINT8) {
148             // For unsigned types IMAD convolution kernel should skip
149             // all negative values.
150             mem_consts.AddConstants({
151                 MakeJitConstant("CONVO_UNSIGNED", 1)
152             });
153         }
154
155         if (params.output.GetLayout() != DataLayout::b_fs_yx_fsv4) {
156             mem_consts.AddConstants({
157                 // Produce unswizzelled results.
158                 MakeJitConstant("TO_UNSWIZZLE", 1),
159             });
160         }
161
162         return mem_consts;
163
164     } // GetJitConstants
165
166
167     ConvolutionKernelBase::DispatchData ConvolutionKernel_imad_3x3::SetDefault(
168                                                const convolution_params& params,
169                                                int) const
170     {
171         DispatchData kd;
172
173         const auto& in      = params.inputs[0];
174         const auto& weights = params.weights;
175         const auto& iDims   = in.GetDims();
176         const auto& wDims   = weights.GetDims();
177         const int iX  = DataTensor::Channelndex(
178                             in.GetLayout(), Tensor::DataChannelName::X);
179         const int iY  = DataTensor::Channelndex(
180                             in.GetLayout(), Tensor::DataChannelName::Y);
181         const int iB  = DataTensor::Channelndex(
182                             in.GetLayout(), Tensor::DataChannelName::BATCH);
183         const int wOD = WeightsTensor::Channelndex(
184                             weights.GetLayout(), Tensor::WeightsChannelName::OFM);
185
186         size_t otw, oth;
187         getOutBlock_WH(iDims[iX].v, params.stride.x, iDims[iX].pad.before + iDims[iX].pad.after,
188                        otw, oth);
189
190         std::vector<size_t> global = {
191             //globalRange[0] = ((_IW / K_STRIDE) + (OTW - 1)) / OTW;
192             // number of tiles needed to cover output width
193             (((iDims[iX].v / params.stride.x) + (otw - 1)) / otw),
194
195             //globalRange[1] = ((_IH / K_STRIDE) + (OTH - 1)) / OTH;
196             // number of tiles needed to cover output height
197             (((iDims[iY].v / params.stride.y) + (oth - 1)) / oth),
198
199             // globalRange[2] = (_OD * _B) + ((_B *_OD) % __WORKGROUP_SIZE);
200             // round depth range up
201             ((wDims[wOD].v * iDims[iB].v) + ((wDims[wOD].v * iDims[iB].v) % SIMD_SIZE))
202         };
203
204         std::vector<size_t> local = {1, 1, SIMD_SIZE};
205
206         kd.gws0 = global[0];
207         kd.gws1 = global[1];
208         kd.gws2 = global[2];
209
210         kd.lws0 = local[0];
211         kd.lws1 = local[1];
212         kd.lws2 = local[2];
213
214         kd.cldnnStyle = { 0 };
215         kd.gemmStyle  = { 0 };
216         kd.effiency   = FORCE_PRIORITY_1;
217
218         return kd;
219
220     } // SetDefault
221
222     bool
223     ConvolutionKernel_imad_3x3::Validate(
224             const Params&          params,
225             const optional_params& options) const
226     {
227         if (!Parent::Validate(params, options))
228         {
229             return false;
230         }
231
232         KernelData kd = KernelData::Default<convolution_params>(params);
233         convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());
234
235         if (newParams.stride.x != newParams.stride.y) {
236             // Strides must be equial
237             return false;
238         }
239         else if ((newParams.filterSize.x != m_FilterSizeX) ||
240                  (newParams.filterSize.y != m_FilterSizeY)) {
241             // Kernel does not support such filter size
242             return false;
243         }
244         else {
245             const auto& in = newParams.inputs[0];
246             const auto& iDims = in.GetDims();
247             const int iX = DataTensor::Channelndex(
248                 in.GetLayout(), Tensor::DataChannelName::X);
249             if (iDims[iX].v % OUT_BLOCK_THRESHOLD != 0) {
250                 // Input size must be multiple of OUT_BLOCK_THRESHOLD
251                 return false;
252             }
253         }
254
255         return true;
256     }
257
258     KernelsData
259     ConvolutionKernel_imad_3x3::GetCommonKernelsData(
260                                 const Params&          params,
261                                 const optional_params& options,
262                                 const std::string      exeMode,
263                                 int                    autoTuneIndex) const
264     {
265         if (!Validate(params, options))
266         {
267             return{};
268         }
269
270         KernelData kd = KernelData::Default<convolution_params>(params);
271         convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());
272         DispatchData runInfo = SetDefault(newParams, autoTuneIndex);
273         if (!CheckWorkGroups(runInfo))
274         {
275             // Internal Error - wrong calculation of global/local work group sizes
276             return{};
277         }
278
279         bool succeed = UpdateWeightsParams(
280             newParams,
281             options,
282             GetSupportedWeightLayouts(newParams),
283             kd.weightsReorderParams,
284             GetSupportedKey());
285
286         if (!succeed)
287         {
288             return{};
289         }
290
291         auto finalKernelName = GetKernelName(newParams);
292         auto cldnnJit = GetJitConstants(newParams, runInfo);
293         auto entryPoint = GetEntryPoint(finalKernelName, newParams.layerID, options);
294         auto jit = CreateJit(finalKernelName, cldnnJit, entryPoint);
295
296         auto& kernel = kd.kernels[0];
297         FillCLKernelData(kernel, runInfo, params.engineInfo, finalKernelName, jit, entryPoint, exeMode, true, !newParams.bias.empty(), 1, newParams.int8_quantization, newParams.output_calibration);
298
299         kd.estimatedTime = runInfo.effiency;
300         kd.autoTuneIndex = autoTuneIndex;
301
302         return{ kd };
303
304     } // GetCommonKernelsData
305 }