2 // Copyright (c) 2016 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 #include "convolution_kernel_bfyx_os_iyx_osv16_2_sg.h"
19 namespace kernel_selector
21 // Sub-group size assumed for the "convolution_gpu_bfyx_os_iyx_osv16_2_sg" kernel configured by this file.
// The dispatch set-up below uses it to round the output feature count and to size the local work group.
22 constexpr size_t sub_group_size = 16;
24 ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::ConvolutionKernel_bfyx_os_iyx_osv16_2_sg() : ConvolutionKernelBase("convolution_gpu_bfyx_os_iyx_osv16_2_sg")
26 // Generate the dispatch options to the auto-tuner.
27 std::vector<size_t> blockWidthSizes = { 1,2,4,5,6,8,10,12,14,16 };
28 std::vector<size_t> blockHeightSizes = { 1,2,3,4,5 };
29 std::vector<size_t> prefetchSizes = { 1,2,3,4,5,6,8,10 };
30 std::vector<std::string> executionModes = ConvolutionKernelBase::autoTuneOptions;
31 const size_t maxBlockSize = 60;
33 for (auto executionMode : executionModes)
35 for (auto blockWidth : blockWidthSizes)
37 for (auto blockHeight : blockHeightSizes)
39 for (auto prefetch : prefetchSizes)
41 if (blockWidth * blockHeight <= maxBlockSize)
43 autoTuneOptions.emplace_back(AutoTuneOption{ blockWidth, blockHeight, prefetch, executionMode });
// Describes the parameter combinations this kernel declares support for.
// The visible calls enable: F16 and F32 for input data, weights and output data; the bfyx layout
// for input and output; tensor offsets and pitches; the per-feature, per-output and no-bias
// variants; and split support.
// NOTE(review): the declaration of `k` and the return statement are not visible in this excerpt;
// presumably `k` is a local ParamsKey that is returned at the end — confirm against the full file.
51 ParamsKey ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetSupportedKey() const
// Data types for input data, weights and output data.
54 k.EnableInputDataType(Datatype::F16);
55 k.EnableInputDataType(Datatype::F32);
56 k.EnableInputWeightsType(WeightsType::F16);
57 k.EnableInputWeightsType(WeightsType::F32);
58 k.EnableOutputDataType(Datatype::F16);
59 k.EnableOutputDataType(Datatype::F32);
// Only the bfyx layout is enabled on both sides.
60 k.EnableInputLayout(DataLayout::bfyx);
61 k.EnableOutputLayout(DataLayout::bfyx);
62 k.EnableTensorOffset();
63 k.EnableTensorPitches();
// Bias variants: per feature, per output element, or no bias at all.
65 k.EnableBiasPerFeature();
66 k.EnableBiasPerOutput();
67 k.EnableNonBiasTerm();
69 k.EnableSplitSupport();
// Computes the size of the input block needed to produce one output block of
// output_block_width x output_block_height elements without re-reading input.
// Returns a pair {input_block_array_size, input_block_read_width}:
//   first  - number of sub-group-sized vectors needed to store the input block,
//   second - number of elements read per input row after rounding.
// NOTE(review): `stride` and `sg_size` are used in the body but their declarations are not
// visible in this excerpt — confirm the full parameter list and defaults in the original file.
75 static std::pair<size_t, size_t> get_bfyx_req_input_block_dims(
76 size_t output_block_width,
77 size_t output_block_height,
78 const uSize& filter_size,
80 const uSize& dilation,
82 size_t read_chunk_size = 8,
83 size_t min_read_size = 16)
// The formulas below subtract 1 from these unsigned values, so zero would wrap around.
85 assert(output_block_width > 0 && output_block_height > 0);
86 assert(stride.x > 0 && stride.y > 0);
87 assert(filter_size.x > 0 && filter_size.y > 0);
89 // Number of elements in X dimension needed from input to compute output block without re-reading input.
90 size_t input_block_req_width = (output_block_width - 1) * stride.x + (filter_size.x - 1)*dilation.x + 1;
91 // Number of elements in Y dimension needed from input to compute output block without re-reading input.
92 size_t input_block_req_height = (output_block_height - 1) * stride.y + (filter_size.y - 1)*dilation.y + 1;
94 // Required number of elements in X dimension, rounded with RoundUp to the read chunk size and never below min_read_size.
95 size_t input_block_read_width = std::max(RoundUp(input_block_req_width, read_chunk_size), min_read_size);
96 // Number of sub-group-sized vectors of unit type needed to store input block.
97 size_t input_block_array_size = CeilDiv(input_block_req_height * input_block_read_width, sg_size);
99 return std::make_pair(input_block_array_size, input_block_read_width);
102 static void shrink_blocks_to_output_size(size_t output_x, size_t output_y, size_t &block_x, size_t &block_y)
104 // how many elements we will compute in each dimension
105 size_t computed_x = Align(output_x, block_x);
106 size_t computed_y = Align(output_y, block_y);
107 // how many simds we need in each dimension
108 size_t simds_x = computed_x / block_x;
109 size_t simds_y = computed_y / block_y;
110 // how many unused values we have in each dimension
111 size_t unused_x = computed_x - output_x;
112 size_t unused_y = computed_y - output_y;
114 block_x -= unused_x / simds_x;
115 block_y -= unused_y / simds_y;
// Chooses the block configuration for a convolution.
// When autoTuneIndex is a valid index into autoTuneOptions, that stored option is returned as is.
// Otherwise a default is derived from the stride, filter size, dilation and output width of the
// convolution parameters, and (except for the 1x1, batch-1 case) shrunk to fit the output size.
// NOTE(review): several lines of this function are not visible in this excerpt (the `else`
// keywords of the fallback branches, any prefetch assignments, and the final return) —
// confirm against the full file before relying on the branch structure described here.
118 ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::AutoTuneOption ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetAutoTuneOptions(const Params& p, int autoTuneIndex) const
// An explicit, in-range tuning index selects a pre-generated option directly.
120 if ((autoTuneIndex >= 0) && (autoTuneIndex < (int)autoTuneOptions.size()))
122 return autoTuneOptions[autoTuneIndex];
// No valid index: start from an empty option using the DEFAULT execution mode.
125 AutoTuneOption option = { 0, 0, 0, DEFAULT };
// Assumes p actually refers to a convolution_params object; the cast is unchecked.
127 const convolution_params& cp = static_cast<const convolution_params&>(p);
// Stride 1x1 cases.
129 if (cp.stride.x == 1 && cp.stride.y == 1)
// 1x1 filter: 16x1 output block.
131 if (cp.filterSize.x == 1 && cp.filterSize.y == 1)
133 option.blockWidth = 16;
134 option.blockHeight = 1;
137 //if fewer than 16 values are required to compute one single row of output
138 //then each WI shall compute one single row to maximize reuse within SIMD subgroup (this gives very nice performance results)
139 else if (cp.output.X().v + (cp.filterSize.x - 1)*cp.dilation.x < sub_group_size)
141 option.blockWidth = cp.output.X().v;
142 option.blockHeight = 1;
// Filters smaller than 5 in both dimensions: block width is sub_group_size - filterSize.x + 1, two rows high.
145 else if (cp.filterSize.x < 5 && cp.filterSize.y < 5)
147 option.blockWidth = sub_group_size - cp.filterSize.x + 1;
148 option.blockHeight = 2;
// Presumably the fallback for the remaining stride-1 cases (branch keyword not visible) — confirm.
153 option.blockWidth = 4;
154 option.blockHeight = 3;
// Stride 2x2 case.
158 else if (cp.stride.x == 2 && cp.stride.y == 2)
160 option.blockWidth = 5;
161 option.blockHeight = 4;
// Presumably the fallback for all other strides (branch keyword not visible) — confirm.
166 option.blockWidth = 4;
167 option.blockHeight = 3;
169 //run_info.effiency = FORCE_PRIORITY_7; // GEMM is better
172 // if this is not the 1x1 batch-1 case then shrink the blocks; otherwise we're memory bound and it's best to use 16x1 block sizes
173 if (cp.filterSize.x != 1 || cp.filterSize.y != 1 || cp.output.Batch().v != 1)
175 shrink_blocks_to_output_size(cp.output.X().v, cp.output.Y().v,
176 option.blockWidth, option.blockHeight);
// Builds the dispatch data for this kernel: starts from the base-class defaults, then fills in
// the priority, the block/prefetch configuration chosen by GetAutoTuneOptions, the derived
// input block dimensions and the global/local work sizes.
// NOTE(review): some lines are not visible in this excerpt (most arguments of the
// get_bfyx_req_input_block_dims call, any lws0/lws1 assignments, and the return) — confirm
// against the full file.
182 ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::SetDefault(const convolution_params& cp, int autoTuneIndex) const
184 DispatchData runInfo = ConvolutionKernelBase::SetDefault(cp);
// Output feature count rounded with RoundUp to the sub-group size.
186 const auto of_maps = cp.output.Feature().v;
187 const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);
// Fixed priority for this kernel; the meaning of FORCE_PRIORITY_3 is defined elsewhere.
189 runInfo.effiency = FORCE_PRIORITY_3;
// Block size and prefetch come either from the tuning table or from the built-in defaults.
191 auto tuneOptions = GetAutoTuneOptions(cp, autoTuneIndex);
192 runInfo.cldnnStyle.blockWidth = tuneOptions.blockWidth;
193 runInfo.cldnnStyle.blockHeight = tuneOptions.blockHeight;
194 runInfo.cldnnStyle.prefetch = tuneOptions.prefetch;
// Input block needed for one output block; returns {array size, read width}.
196 auto input_block_dims = get_bfyx_req_input_block_dims(
197 runInfo.cldnnStyle.blockWidth,
198 runInfo.cldnnStyle.blockHeight,
// Full sub-group size when fp16 units are used, half of it otherwise.
// Which parameter this argument feeds is not visible here — presumably read_chunk_size; confirm.
203 runInfo.fp16UnitUsed ? sub_group_size : sub_group_size / 2,
205 runInfo.cldnnStyle.inputBlockArraySize = input_block_dims.first;
206 runInfo.cldnnStyle.inputBlockWidth = input_block_dims.second;
// Global work size: number of output blocks in X and Y, and twice the rounded feature count per batch.
// NOTE(review): the factor 2 presumably matches the two sub-groups implied by "2_sg" in the kernel name — confirm.
208 runInfo.gws0 = CeilDiv(cp.output.X().v, runInfo.cldnnStyle.blockWidth);
209 runInfo.gws1 = CeilDiv(cp.output.Y().v, runInfo.cldnnStyle.blockHeight);
210 runInfo.gws2 = 2 * of_threads_per_batch * cp.output.Batch().v;
// Local work size in dimension 2 spans two sub-groups' worth of work items.
214 runInfo.lws2 = 2*sub_group_size;
// Checks whether this kernel can handle the given parameters.
// Visible conditions: the base-class validation and CovolutionCheckInput must both pass, the
// first input must have an even feature count of at least 64, and the output feature count
// must be a multiple of 64.
// NOTE(review): the statements executed when a check fails and the final return are not visible
// in this excerpt; presumably each failed check returns false and the function otherwise
// returns true — confirm against the full file.
219 bool ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::Validate(const Params& p, const optional_params& o) const
// Generic convolution validation first.
221 if (!ConvolutionKernelBase::Validate(p, o) ||
222 !CovolutionCheckInput(p, o))
// Assumes p actually refers to a convolution_params object; the cast is unchecked.
227 const convolution_params& cp = static_cast<const convolution_params&>(p);
// Condition is true when the input feature count is odd or smaller than 64.
229 if (cp.inputs[0].Feature().v % 2 != 0 || cp.inputs[0].Feature().v < 64)
// Condition is true when the output feature count is not a multiple of 64.
232 if (cp.output.Feature().v % 64 != 0)
// Produces the JIT constants for the kernel: the parent's constants plus the sub-group size,
// the output block dimensions, the input block dimensions, the prefetch value and LEFTOVERS.
// NOTE(review): lines between the PREFETCH and LEFTOVERS constants and the final return are not
// visible in this excerpt — LEFTOVERS may be added conditionally; confirm against the full file.
238 JitConstants ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetJitConstants(const convolution_params& params, const DispatchData& runInfo) const
// leftovers = difference between the feature count rounded with RoundUp to the sub-group size
// and the real output feature count.
240 const auto of_maps = params.output.Feature().v;
241 const size_t of_threads_per_batch = RoundUp(of_maps, sub_group_size);
242 size_t leftovers = of_threads_per_batch - of_maps;
244 auto jit = Parent::GetJitConstants(params, runInfo);
// NOTE(review): the literal 16 duplicates the file-level sub_group_size constant; the two must stay in sync.
246 jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", 16));
// Block configuration taken from the dispatch data.
247 jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_WIDTH", runInfo.cldnnStyle.blockWidth));
248 jit.AddConstant(MakeJitConstant("OUTPUT_BLOCK_HEIGHT", runInfo.cldnnStyle.blockHeight));
249 jit.AddConstant(MakeJitConstant("IN_BLOCK_ARRAY_SIZE", runInfo.cldnnStyle.inputBlockArraySize));
250 jit.AddConstant(MakeJitConstant("IN_BLOCK_WIDTH", runInfo.cldnnStyle.inputBlockWidth));
251 jit.AddConstant(MakeJitConstant("PREFETCH", runInfo.cldnnStyle.prefetch));
255 jit.AddConstant(MakeJitConstant("LEFTOVERS", leftovers));
261 std::vector<WeightsLayout> ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetSupportedWeightLayouts(const convolution_params& params) const
263 if (!params.transposed)
265 return{ WeightsLayout::os_iyx_osv16 };
269 return{ WeightsLayout::os_iyx_osv16_rotate_180 };
273 KernelsData ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetKernelsData(const Params& params, const optional_params& options) const
275 return GetTunedKernelsDataByIndex(params, options);
278 KernelsData ConvolutionKernel_bfyx_os_iyx_osv16_2_sg::GetKernelsDataForAutoTune(const Params& params, const optional_params& options) const
280 if (!Validate(params, options))
285 KernelsData res = {};
287 for (size_t i = 0; i < autoTuneOptions.size(); i++)
289 KernelsData kd = GetTunedKernelsDataByIndex(params, options, (int)i);
292 res.emplace_back(kd[0]);