2 // Copyright (c) 2016 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 #include "convolution_kernel_yxfb_yxio_b1_block_multiple_x.h"
19 namespace kernel_selector
// Work-group size constant for this kernel. In this file it is assigned to the
// dispatch field lws0 (see SetDefault) and passed as the localWorkSize argument
// of GetOfmPerWorkitem.
22 constexpr size_t local_work_size = 16;
// Describes which parameter combinations this kernel supports by enabling
// features on the ParamsKey object `k`.
// NOTE(review): the declaration of `k` and the return statement are not visible
// in this view; presumably `k` is a local ParamsKey that is returned - confirm.
24 ParamsKey ConvolutionKernel_yxfb_yxio_b1_block_mulitple_x::GetSupportedKey() const
// Data types: F32 input and output data; weights may be F16 or F32.
27 k.EnableInputDataType(Datatype::F32);
28 k.EnableInputWeightsType(WeightsType::F16);
29 k.EnableInputWeightsType(WeightsType::F32);
30 k.EnableOutputDataType(Datatype::F32);
// Layouts: yxfb for both input and output.
31 k.EnableInputLayout(DataLayout::yxfb);
32 k.EnableOutputLayout(DataLayout::yxfb);
// Tensor addressing features: offsets and pitches.
33 k.EnableTensorOffset();
34 k.EnableTensorPitches();
// Both the per-feature bias and the no-bias variants are enabled.
35 k.EnableBiasPerFeature();
36 k.EnableNonBiasTerm();
38 k.EnableSplitSupport();
// Chooses how many output feature maps (OFM) a single work item processes,
// based on the divisibility of filter_ofm_num by multiples of localWorkSize.
//   filter_ofm_num - number of output feature maps of the filter
//   localWorkSize  - work-group size used for the divisibility checks
// NOTE(review): the values returned by each branch are not visible in this view;
// callers in this file compare the result against 4 and 2 (and 8 in
// GetJitConstants) - confirm the actual return values against the full file.
45 size_t GetOfmPerWorkitem(size_t filter_ofm_num, size_t localWorkSize)
// Case: OFM count is a multiple of 4 * localWorkSize.
47 if (filter_ofm_num % (localWorkSize * 4) == 0)
// Case: OFM count is a multiple of 2 * localWorkSize.
49 if (filter_ofm_num % (localWorkSize * 2) == 0)
// Computes the dispatch configuration for this kernel.
// Starts from ConvolutionKernelBase::SetDefault(arg, autoTuneIndex), then:
//   - sets lws0 to local_work_size;
//   - divides gws1 (rounding up) by the number of "x" coordinates handled per
//     work item: 4 when GetOfmPerWorkitem yields 4, otherwise 8;
//   - sets gws0 to filter_ofm_num * batch_size / ofmPerWorkItem.
// NOTE(review): the rounding-up division goes through float (std::ceil on a
// static_cast<float>); presumably gws1 stays small enough for float to represent
// it exactly - an integer ceil-division would avoid that assumption.
// NOTE(review): the return statement, the closing of the commented-out block
// below and the `else` keyword before the last gws1 adjustment are not visible
// in this view - confirm against the full file.
55 ConvolutionKernelBase::DispatchData ConvolutionKernel_yxfb_yxio_b1_block_mulitple_x::SetDefault(const convolution_params& arg, int autoTuneIndex) const
57 DispatchData runInfo = ConvolutionKernelBase::SetDefault(arg, autoTuneIndex);
59 const auto filter_ofm_num = arg.weights.OFM().v;
60 const auto batch_size = arg.output.Batch().v;
62 runInfo.lws0 = local_work_size;
64 // We cannot return 8 because we are processing 4 spatial coordinates for batch1,
65 // and if we use more than 4 ofm_per_work_item we downgrade simd16 to simd8 which would break this algorithm.
66 // NOTE: We could return 8 but then we must process only 2 coordinates, which is slower than processing 4 coordinates using blockread4
67 // TODO: experiment with SIMD8 version of algorithm and check if it could be faster
68 /*if (output_feature_count % (lws * 8) == 0)
70 run_info.ofm_per_work_item = 8;
71 run_info.gws1 = static_cast<size_t>(std::ceil(static_cast<float>(run_info.gws1) / 2.0f));
74 const size_t ofmPerWorkItem = GetOfmPerWorkitem(filter_ofm_num, local_work_size);
75 if (ofmPerWorkItem == 4)
77 // We compute multiple spatial coordinates "x" in a single workitem that's why we must divide
78 runInfo.gws1 = static_cast<size_t>(std::ceil(static_cast<float>(runInfo.gws1) / 4.0f));
80 else if (ofmPerWorkItem == 2)
82 runInfo.gws1 = static_cast<size_t>(std::ceil(static_cast<float>(runInfo.gws1) / 8.0f));
86 runInfo.gws1 = static_cast<size_t>(std::ceil(static_cast<float>(runInfo.gws1) / 8.0f));
89 runInfo.gws0 = filter_ofm_num * batch_size / ofmPerWorkItem;
// Builds the JIT constants for this kernel: takes the constants produced by
// ConvolutionKernelBase::GetJitConstants(params, kd) and adds USE_VECTOR,
// X_PER_WORK_ITEM, OFM_PER_WORK_ITEM and LOCAL_WORK_GROUP_SIZE.
//   params - convolution parameters; the OFM count of the weights is read from it
//   kd     - dispatch data; its lws0 becomes LOCAL_WORK_GROUP_SIZE
// NOTE(review): the return statement is not visible in this view; presumably
// cldnn_jit is returned - confirm.
94 JitConstants ConvolutionKernel_yxfb_yxio_b1_block_mulitple_x::GetJitConstants(const convolution_params& params, const DispatchData& kd) const
96 auto cldnn_jit = ConvolutionKernelBase::GetJitConstants(params, kd);
98 size_t ofmPerWorkItem = GetOfmPerWorkitem(params.weights.OFM().v, local_work_size);
99 cldnn_jit.AddConstant(MakeJitConstant("USE_VECTOR", ofmPerWorkItem));
// X_PER_WORK_ITEM mapping: 8 OFM -> 2, 4 OFM -> 4, anything else -> 8.
// NOTE(review): the comments in SetDefault state that 8 OFM per work item is not
// used, so the `== 8` branch is presumably unreachable - confirm.
100 if (ofmPerWorkItem == 8)
102 cldnn_jit.AddConstant(MakeJitConstant("X_PER_WORK_ITEM", 2));
104 else if (ofmPerWorkItem == 4)
106 cldnn_jit.AddConstant(MakeJitConstant("X_PER_WORK_ITEM", 4));
110 cldnn_jit.AddConstant(MakeJitConstant("X_PER_WORK_ITEM", 8));
113 cldnn_jit.AddConstant(MakeJitConstant("OFM_PER_WORK_ITEM", ofmPerWorkItem)); // how many output feature maps for a single batch will a single work item produce
114 cldnn_jit.AddConstant(MakeJitConstant("LOCAL_WORK_GROUP_SIZE", kd.lws0));
// Checks whether this kernel can handle the given parameters.
// The visible checks are, in order:
//   1. ConvolutionKernelBase::Validate(p, o) must succeed;
//   2. CheckPitchForSplitOnly(params) must succeed;
//   3. filter_ofm_num > 0, batch_size == 1, and the output feature count must
//      equal filter_ofm_num;
//   4. filter_ofm_num * batch_size must be a multiple of 16.
// NOTE(review): the statements executed after each failed check and the final
// return are not visible in this view; presumably a failed check returns false
// and the function otherwise returns true - confirm.
118 bool ConvolutionKernel_yxfb_yxio_b1_block_mulitple_x::Validate(const Params& p, const optional_params& o) const
120 if (!ConvolutionKernelBase::Validate(p, o))
// p is treated as convolution_params from here on (unchecked static_cast).
125 const convolution_params& params = static_cast<const convolution_params&>(p);
127 if (!CheckPitchForSplitOnly(params))
132 const auto filter_ofm_num = params.weights.OFM().v;
133 const auto batch_size = params.output.Batch().v;
135 const bool bInputValidated =
136 (filter_ofm_num > 0) &&
137 (batch_size == 1) && // current implementation doesn't support batching
138 // (subgroup is along batch*ofm and trying to block read filter/bias along batch and filter doesn't contain batching).
139 (params.output.Feature().v == filter_ofm_num);
141 if (!bInputValidated)
// NOTE(review): 16 equals the value of local_work_size in this file; presumably
// it is the same quantity and the named constant could be used - confirm.
146 if ((filter_ofm_num * batch_size) % 16 != 0)
// Produces the kernel data for the given parameters by delegating to
// GetTunedKernelsDataByIndex(params, options) and returning its result.
154 KernelsData ConvolutionKernel_yxfb_yxio_b1_block_mulitple_x::GetKernelsData(const Params& params, const optional_params& options) const
156 return GetTunedKernelsDataByIndex(params, options);