// Copyright (c) 2016 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "convolution_kernel_yxfb_yxio_b16.h"

#include <algorithm>
#include <string>
19 namespace kernel_selector
22 ParamsKey ConvolutionKernel_yxfb_yxio_b16::GetSupportedKey() const
25 k.EnableInputDataType(Datatype::F16);
26 k.EnableInputDataType(Datatype::F32);
27 k.EnableInputWeightsType(WeightsType::F16);
28 k.EnableInputWeightsType(WeightsType::F32);
29 k.EnableOutputDataType(Datatype::F16);
30 k.EnableOutputDataType(Datatype::F32);
31 k.EnableInputLayout(DataLayout::yxfb);
32 k.EnableOutputLayout(DataLayout::yxfb);
33 k.EnableTensorOffset();
34 k.EnableTensorPitches();
35 k.EnableBiasPerFeature();
36 k.EnableNonBiasTerm();
38 k.EnableSplitSupport();
44 std::string ConvolutionKernel_yxfb_yxio_b16::GetKernelName(const convolution_params& params) const
46 if (params.inputs[0].GetDType() == Datatype::F32)
48 return kernelName + "_fp32";
52 return kernelName + "_fp16";
57 // how many batches will a single work item compute
58 size_t GetBatchesPerWorkItem(size_t batch_size, Datatype dataType)
60 if (dataType == Datatype::F16)
62 const uint32_t min_batches_per_wi = 1;
63 const uint32_t min_lws = 16;
65 if (batch_size % (4 * min_batches_per_wi * min_lws) == 0)
67 return 4 * min_batches_per_wi; // USE_BLOCK_READ_2 + as_half4
69 else if (batch_size % (2 * min_batches_per_wi * min_lws) == 0)
71 return 2 * min_batches_per_wi; // USE_BLOCK_READ_1 + as_half2
75 return min_batches_per_wi;
84 size_t GetOfmPerWorkitem(Datatype dataType)
86 if (dataType == Datatype::F16)
92 ConvolutionKernelBase::DispatchData ConvolutionKernel_yxfb_yxio_b16::SetDefault(const convolution_params& arg, int) const
94 DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg);
96 const auto filter_ofm_num = arg.weights.OFM().v;
97 const auto batch_size = arg.output.Batch().v;
98 const uint32_t min_lws = 16;
100 const size_t batchesPerWorkItem = GetBatchesPerWorkItem(batch_size, arg.inputs[0].GetDType());
101 const size_t ofmPerWorkItem = GetOfmPerWorkitem(arg.inputs[0].GetDType());
103 if (arg.inputs[0].GetDType() == Datatype::F16)
105 runInfo.effiency = FORCE_PRIORITY_7;
109 runInfo.effiency = FORCE_PRIORITY_9;
112 runInfo.lws0 = min_lws;
113 runInfo.gws0 = filter_ofm_num * batch_size / (ofmPerWorkItem * batchesPerWorkItem);
118 bool ConvolutionKernel_yxfb_yxio_b16::Validate(const Params& p, const optional_params& o) const
120 if (!ConvolutionKernelBase::Validate(p, o))
124 const convolution_params& params = static_cast<const convolution_params&>(p);
126 const auto filter_ofm_num = params.weights.OFM().v;
127 const auto batch_size = params.output.Batch().v;
128 const uint32_t min_lws = 16;
130 const bool bInputValidated =
131 (filter_ofm_num > 0) &&
133 (params.output.Feature().v == filter_ofm_num);
135 if (!bInputValidated)
140 if (params.inputs[0].GetDType() == Datatype::F16)
142 const uint32_t min_ofm_per_wi = 16;
143 const uint32_t min_batches_per_wi = 1;
145 const bool bFilterOK = filter_ofm_num % min_ofm_per_wi == 0; // Number of output features dividable by minimum number of output features processed inside work item.
146 const bool bBatchOK = batch_size % (min_batches_per_wi * min_lws) == 0; // Batch size dividable by minimum number of batches processed when smallest local work size is used.
148 if (!bFilterOK || !bBatchOK)
155 if ((filter_ofm_num * batch_size) % min_lws != 0 ||
156 batch_size < 32) // TODO: check why it's not supported
165 JitConstants ConvolutionKernel_yxfb_yxio_b16::GetJitConstants(const convolution_params& params, const DispatchData& kd) const
167 auto jit = Parent::GetJitConstants(params, kd);
169 const auto local_work_group_size = kd.lws0;
170 const auto batch_size = params.output.Batch().v;
172 if (params.inputs[0].GetDType() == Datatype::F32)
174 // A LITTLE HACK, for convolutions with low number of input features don't use block reads, and it will speed up by 25%
175 // TODO - investigate why is this happening
176 if (params.inputs[0].Feature().v > 4)
178 jit.AddConstant(MakeJitConstant("USE_BLOCK_READ_2", ""));
183 const auto batch_pad_before = params.output.Batch().pad.before;
184 const auto feature_pitch = params.output.Feature().pitch;
186 if (batch_size >= 64 && (feature_pitch % 2 == 0) && (batch_pad_before % 2 == 0))
188 jit.AddConstant(MakeJitConstant("USE_BLOCK_READ_2", ""));
190 else if (batch_size >= 32 && (feature_pitch % 2 == 0) && (batch_pad_before % 2 == 0))
192 jit.AddConstant(MakeJitConstant("USE_BLOCK_READ_1", ""));
196 const size_t batchesPerWorkItem = GetBatchesPerWorkItem(batch_size, params.inputs[0].GetDType());
197 const size_t ofmPerWorkItem = GetOfmPerWorkitem(params.inputs[0].GetDType());
200 MakeJitConstant("LOCAL_WORK_GROUP_SIZE", kd.lws0),
201 MakeJitConstant("OFM_PER_WORK_ITEM", ofmPerWorkItem),
202 MakeJitConstant("BATCHES_PER_WORK_ITEM", batchesPerWorkItem), // how many batches will a single work item compute
203 MakeJitConstant("LOCAL_WORK_GROUPS_PER_SINGLE_BATCHES_ELEMENTS", std::max(batch_size / batchesPerWorkItem / local_work_group_size, static_cast<size_t>(1))), // how many local work groups we need to compute single element for each batch
204 MakeJitConstant("WORK_ITEMS_PER_SINGLE_BATCHES_ELEMENTS", batch_size / batchesPerWorkItem), // how many work items we need to compute single element for each batch
210 KernelsData ConvolutionKernel_yxfb_yxio_b16::GetKernelsData(const Params& params, const optional_params& options) const
212 return GetTunedKernelsDataByIndex(params, options);