/*
// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

#include "fused_conv_eltwise_kernel_bfyx_os_iyx_osv16.h"

namespace kernel_selector
{
    // Sub-group size used by "kernel_name_bfyx_os_iyx_osv16" kernel.
    constexpr size_t sub_group_size = 16;

    fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::fused_conv_eltwise_kernel_bfyx_os_iyx_osv16() : fused_conv_eltwise_kernel_base("fused_conv_eltwise_gpu_bfyx_os_iyx_osv16")
    {
        // Generate the dispatch options for the auto-tuner.
        std::vector<size_t> blockWidthSizes = { 1,2,4,5,6,8,10,12,14,16 };
        std::vector<size_t> blockHeightSizes = { 1,2,3,4,5 };
        std::vector<size_t> prefetchSizes = { 1,2,3,4,5,6,8,10 };
        std::vector<std::string> executionModes = fused_conv_eltwise_kernel_base::autoTuneOptions;
        const size_t maxBlockSize = 60;

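        // Enumerate every (width, height, prefetch, mode) combination, keeping
        // only block shapes whose area fits maxBlockSize: e.g. a 16x4 block
        // (64 elements) is rejected, while 12x5 (60 elements) is kept.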
        for (auto executionMode : executionModes)
        {
            for (auto blockWidth : blockWidthSizes)
            {
                for (auto blockHeight : blockHeightSizes)
                {
                    for (auto prefetch : prefetchSizes)
                    {
                        if (blockWidth * blockHeight <= maxBlockSize)
                        {
                            autoTuneOptions.emplace_back(AutoTuneOption{ blockWidth, blockHeight, prefetch, executionMode });
                        }
                    }
                }
            }
        }
    }

    ParamsKey fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::GetSupportedKey() const
    {
        ParamsKey k;
        k.EnableInputDataType(Datatype::F16);
        k.EnableInputDataType(Datatype::F32);
        k.EnableInputWeightsType(WeightsType::F16);
        k.EnableInputWeightsType(WeightsType::F32);
        k.EnableOutputDataType(Datatype::F16);
        k.EnableOutputDataType(Datatype::F32);
        k.EnableInputLayout(DataLayout::bfyx);
        k.EnableOutputLayout(DataLayout::bfyx);
        k.EnableTensorOffset();
        k.EnableTensorPitches();
        k.EnableSubGroup();
        k.EnableBiasPerFeature();
        k.EnableBiasPerOutput();
        k.EnableNonBiasTerm();
        k.EnableBatching();
        k.EnableFusedConvEltwSplitSupport();
        k.EnableFusedConvEltwDilation();
        k.EnableFusedConvEltwTranspose();
        k.EnableFusedConvEltwiseRWOutOpt(); // data for the second input is already in the output buffer
        return k;
    }

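    // Computes the input-block footprint needed to produce one output block.
    // Worked example: a 4x3 output block with a 3x3 filter, stride 1x1 and
    // dilation 1x1 needs a (4-1)*1 + (3-1)*1 + 1 = 6 wide by 5 high input block;
    // the width is padded to max(RoundUp(6, 8), 16) = 16 elements, so the block
    // occupies CeilDiv(5 * 16, 16) = 5 sub-group-sized vectors -> returns {5, 16}.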
    static std::pair<size_t, size_t> get_bfyx_req_input_block_dims(
        size_t output_block_width,
        size_t output_block_height,
        const uSize& filter_size,
        const uSize& stride,
        const uSize& dilation,
        size_t sg_size = 16,
        size_t read_chunk_size = 8,
        size_t min_read_size = 16)
    {
        assert(output_block_width > 0 && output_block_height > 0);
        assert(stride.x > 0 && stride.y > 0);
        assert(filter_size.x > 0 && filter_size.y > 0);

        // Number of elements in X dimension needed from input to compute output block without re-reading input.
        size_t input_block_req_width = (output_block_width - 1) * stride.x + (filter_size.x - 1) * dilation.x + 1;
        // Number of elements in Y dimension needed from input to compute output block without re-reading input.
        size_t input_block_req_height = (output_block_height - 1) * stride.y + (filter_size.y - 1) * dilation.y + 1;

        // Required number of elements in X dimension rounded up to a multiple of the read chunk size, but at least min_read_size.
        size_t input_block_read_width = std::max(RoundUp(input_block_req_width, read_chunk_size), min_read_size);
        // Number of sub-group-sized vectors of unit type needed to store the input block.
        size_t input_block_array_size = CeilDiv(input_block_req_height * input_block_read_width, sg_size);

        return std::make_pair(input_block_array_size, input_block_read_width);
    }

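    // Shrinks an output block so less of the computed area falls outside the
    // real output. Worked example: output_x = 9 with block_x = 8 computes
    // Align(9, 8) = 16 elements in 2 SIMD rows, wasting 7; shrinking by
    // 7 / 2 = 3 gives block_x = 5, which computes 10 elements and wastes only 1.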
    static void shrink_blocks_to_output_size(size_t output_x, size_t output_y, size_t& block_x, size_t& block_y)
    {
        // How many elements will be computed in each dimension.
        size_t computed_x = Align(output_x, block_x);
        size_t computed_y = Align(output_y, block_y);
        // How many SIMD rows are needed in each dimension.
        size_t simds_x = computed_x / block_x;
        size_t simds_y = computed_y / block_y;
        // How many unused values are computed in each dimension.
        size_t unused_x = computed_x - output_x;
        size_t unused_y = computed_y - output_y;

        block_x -= unused_x / simds_x;
        block_y -= unused_y / simds_y;
    }

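    // Heuristic defaults when no tuned index is supplied. For example, a 3x3
    // filter with stride 1x1 gets a (16 - 3 + 1) x 2 = 14x2 output block with
    // prefetch 4, which is then shrunk to the real output size where needed.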
    fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::AutoTuneOption fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::GetAutoTuneOptions(const Params& p, int autoTuneIndex) const
    {
        if ((autoTuneIndex >= 0) && (autoTuneIndex < (int)autoTuneOptions.size()))
        {
            return autoTuneOptions[autoTuneIndex];
        }

        AutoTuneOption option = { 0, 0, 0, DEFAULT };

        const convolution_params& cp = static_cast<const convolution_params&>(p);

        if (cp.stride.x == 1 && cp.stride.y == 1)
        {
            if (cp.filterSize.x == 1 && cp.filterSize.y == 1)
            {
                option.blockWidth = 16;
                option.blockHeight = 1;
                option.prefetch = 4;
            }
            // If fewer than 16 values are required to compute one single row of output,
            // then each work-item computes one single row to maximize reuse within the
            // SIMD sub-group (this gives very good performance results).
            else if (cp.output.X().v + (cp.filterSize.x - 1) * cp.dilation.x < sub_group_size)
            {
                option.blockWidth = cp.output.X().v;
                option.blockHeight = 1;
                option.prefetch = 4;
            }
            else if (cp.filterSize.x < 5 && cp.filterSize.y < 5)
            {
                option.blockWidth = sub_group_size - cp.filterSize.x + 1;
                option.blockHeight = 2;
                option.prefetch = 4;
            }
            else
            {
                option.blockWidth = 4;
                option.blockHeight = 3;
                option.prefetch = 4;
            }
        }
        else if (cp.stride.x == 2 && cp.stride.y == 2)
        {
            option.blockWidth = 5;
            option.blockHeight = 4;
            option.prefetch = 4;
        }
        else
        {
            option.blockWidth = 4;
            option.blockHeight = 3;
            option.prefetch = 5;
            //run_info.effiency = FORCE_PRIORITY_7; // GEMM is better
        }

        // If this is not the 1x1 batch-1 case, shrink the blocks to the output size;
        // otherwise we are memory bound and it is best to keep the 16x1 block size.
        if (cp.filterSize.x != 1 || cp.filterSize.y != 1 || cp.output.Batch().v != 1)
        {
            shrink_blocks_to_output_size(cp.output.X().v, cp.output.Y().v,
                option.blockWidth, option.blockHeight);
        }

        return option;
    }

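    // Example dispatch: with a 14x2 output block on a 56x56 output with 64 feature
    // maps and batch 1, gws = { CeilDiv(56, 14), CeilDiv(56, 2), RoundUp(64, 16) * 1 }
    // = { 4, 28, 64 } and lws = { 1, 1, 16 }: one sub-group per 14x2 output tile.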
    fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::SetDefault(const fused_conv_eltwise_params& cp, int autoTuneIndex) const
    {
        DispatchData runInfo = fused_conv_eltwise_kernel_base::SetDefault(cp);

        const auto of_maps = cp.output.Feature().v;
        const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);

        runInfo.effiency = FORCE_PRIORITY_3;

        auto tuneOptions = GetAutoTuneOptions(cp, autoTuneIndex);
        runInfo.cldnnStyle.blockWidth = tuneOptions.blockWidth;
        runInfo.cldnnStyle.blockHeight = tuneOptions.blockHeight;
        runInfo.cldnnStyle.prefetch = tuneOptions.prefetch;

        auto input_block_dims = get_bfyx_req_input_block_dims(
            runInfo.cldnnStyle.blockWidth,
            runInfo.cldnnStyle.blockHeight,
            cp.conv.filterSize,
            cp.conv.stride,
            cp.conv.dilation,
            sub_group_size,
            runInfo.fp16UnitUsed ? sub_group_size : sub_group_size / 2,
            sub_group_size);
        runInfo.cldnnStyle.inputBlockArraySize = input_block_dims.first;
        runInfo.cldnnStyle.inputBlockWidth = input_block_dims.second;

        runInfo.gws0 = CeilDiv(cp.output.X().v, runInfo.cldnnStyle.blockWidth);
        runInfo.gws1 = CeilDiv(cp.output.Y().v, runInfo.cldnnStyle.blockHeight);
        runInfo.gws2 = of_threads_per_batch * cp.output.Batch().v;

        runInfo.lws0 = 1;
        runInfo.lws1 = 1;
        runInfo.lws2 = sub_group_size;

        return runInfo;
    }

    bool fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::Validate(const Params& p, const optional_params& o) const
    {
        if (!fused_conv_eltwise_kernel_base::Validate(p, o) ||
            !FusedConvolutionEltwiseCheckInput(p, o))
        {
            return false;
        }

        return true;
    }

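    // Translates the chosen dispatch configuration into preprocessor defines
    // consumed by the OpenCL kernel.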
    JitConstants fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& runInfo) const
    {
        const auto of_maps = params.output.Feature().v;
        const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);
        size_t leftovers = of_threads_per_batch - of_maps;

        auto jit = Parent::GetJitConstants(params, runInfo);

        jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2));
        jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", runInfo.cldnnStyle.blockWidth));
        jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", runInfo.cldnnStyle.blockHeight));
        jit.AddConstant(MakeJitConstant("IN_BLOCK_ARRAY_SIZE", runInfo.cldnnStyle.inputBlockArraySize));
        jit.AddConstant(MakeJitConstant("IN_BLOCK_WIDTH", runInfo.cldnnStyle.inputBlockWidth));
        jit.AddConstant(MakeJitConstant("PREFETCH", runInfo.cldnnStyle.prefetch));

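        // Feature maps are padded up to a multiple of the sub-group size, e.g.
        // 70 output feature maps run as 80 threads, so LEFTOVERS = 10 extra
        // lanes whose out-of-range writes the kernel must guard against.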
        if (leftovers)
        {
            jit.AddConstant(MakeJitConstant("LEFTOVERS", leftovers));
        }

        if (!params.eltw.stride.empty())
        {
            jit.AddConstant(MakeJitConstant("ELTW_STRIDE_X", params.eltw.stride[0].x));
            jit.AddConstant(MakeJitConstant("ELTW_STRIDE_Y", params.eltw.stride[0].y));
        }
        else
        {
            jit.AddConstant(MakeJitConstant("ELTW_STRIDE_X", 1));
            jit.AddConstant(MakeJitConstant("ELTW_STRIDE_Y", 1));
        }

        return jit;
    }

    std::vector<WeightsLayout> fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::GetSupportedWeightLayouts(const fused_conv_eltwise_params& params) const
    {
        if (!params.conv.transposed)
        {
            return { WeightsLayout::os_iyx_osv16 };
        }
        else
        {
            return { WeightsLayout::os_iyx_osv16_rotate_180 };
        }
    }

    KernelsData fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::GetKernelsData(const Params& params, const optional_params& options) const
    {
        return GetTunedKernelsDataByIndex(params, options);
    }

    KernelsData fused_conv_eltwise_kernel_bfyx_os_iyx_osv16::GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const
    {
        if (!Validate(params, options))
        {
            return {};
        }

        KernelsData res = {};

        for (size_t i = 0; i < autoTuneOptions.size(); i++)
        {
            KernelsData kd = GetTunedKernelsDataByIndex(params, options, (int)i);
            if (!kd.empty())
            {
                res.emplace_back(kd[0]);
            }
        }

        return res;
    }

}